cuBLAS just doesn't work properly on Windows. Will leave it as a manual flag for others

Concedo 2023-04-22 10:57:38 +08:00
parent ef13443047
commit 4fa3dfe8bc
183 changed files with 4 additions and 281227 deletions


@@ -55,7 +55,6 @@ BONUSCFLAGS2 =
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I./include/cuda -I./include/cuda/crt
#lets try enabling everything
CFLAGS += -pthread -s
@@ -157,7 +156,6 @@ NOAVX2_BUILD =
OPENBLAS_BUILD =
OPENBLAS_NOAVX2_BUILD =
CLBLAST_BUILD =
CUBLAS_BUILD =
ifeq ($(OS),Windows_NT)
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
@@ -165,7 +163,6 @@ ifeq ($(OS),Windows_NT)
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/cuda.lib lib/cublas.lib lib/cublasLt.lib lib/cudart.lib lib/cudart_static.lib lib/ggml-cuda-kernel.lib -shared -o $@.dll $(LDFLAGS)
else
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
@@ -176,17 +173,12 @@ else
ifdef LLAMA_CLBLAST
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif
ifdef LLAMA_CUBLAS
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 -shared -o $@.so $(LDFLAGS)
endif
ifndef LLAMA_OPENBLAS
ifndef LLAMA_CLBLAST
ifndef LLAMA_CUBLAS
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
endif
endif
endif
endif
#
@@ -223,9 +215,6 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
ggml_clblast.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
ggml_cublas.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CUBLAS_FLAGS) -c $< -o $@
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
@@ -248,7 +237,7 @@ gpttype_adapter.o: gpttype_adapter.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so gptj.exe gpt2.exe
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -270,9 +259,6 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_rwkv.o ggml_v1_noavx2.o e
koboldcpp_clblast: ggml_clblast.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(CLBLAST_BUILD)
koboldcpp_cublas: ggml_cublas.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(CUBLAS_BUILD)
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
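
Note for anyone who still wants the CUDA path after this change: the removed rules above show what the cuBLAS build looked like. Below is a minimal sketch of restoring it by hand on Linux, assuming the system CUDA toolkit lives under /usr/local/cuda (this commit also deletes the vendored ./include/cuda headers and the Windows lib/*.lib import libraries, so those paths are dropped here). Build with: make LLAMA_CUBLAS=1 koboldcpp_cublas

# Sketch only: re-adds the removed cuBLAS rules for a manual Linux build.
CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include

ifdef LLAMA_CUBLAS
# Link against the static cuBLAS/CUDA runtime libraries from the system toolkit.
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 -shared -o $@.so $(LDFLAGS)
endif

# Compile ggml with GGML_USE_CUBLAS enabled.
ggml_cublas.o: ggml.c ggml.h
	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CUBLAS_FLAGS) -c $< -o $@

# Shared-library target mirroring the other koboldcpp_* targets.
koboldcpp_cublas: ggml_cublas.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
	$(CUBLAS_BUILD)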

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,129 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
#endif
#include <d3d10.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */


@@ -1,122 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D10_EXT_H
#define __OPENCL_CL_D3D10_EXT_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d10_sharing */
typedef cl_uint cl_d3d10_device_source_nv;
typedef cl_uint cl_d3d10_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D10_DEVICE_NV -1002
#define CL_INVALID_D3D10_RESOURCE_NV -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV -1005
// cl_d3d10_device_source_nv
#define CL_D3D10_DEVICE_NV 0x4010
#define CL_D3D10_DXGI_ADAPTER_NV 0x4011
// cl_d3d10_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D10_NV 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_NV 0x4013
// cl_context_info
#define CL_CONTEXT_D3D10_DEVICE_NV 0x4014
// cl_mem_info
#define CL_MEM_D3D10_RESOURCE_NV 0x4015
// cl_image_info
#define CL_IMAGE_D3D10_SUBRESOURCE_NV 0x4016
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)(
cl_platform_id platform,
cl_d3d10_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D10_H


@@ -1,128 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
#endif
#include <d3d11.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */


@@ -1,122 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D11_EXT_H
#define __OPENCL_CL_D3D11_EXT_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d11_sharing */
typedef cl_uint cl_d3d11_device_source_nv;
typedef cl_uint cl_d3d11_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D11_DEVICE_NV -1006
#define CL_INVALID_D3D11_RESOURCE_NV -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009
// cl_d3d11_device_source_nv
#define CL_D3D11_DEVICE_NV 0x4019
#define CL_D3D11_DXGI_ADAPTER_NV 0x401A
// cl_d3d11_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C
// cl_context_info
#define CL_CONTEXT_D3D11_DEVICE_NV 0x401D
// cl_mem_info
#define CL_MEM_D3D11_RESOURCE_NV 0x401E
// cl_image_info
#define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)(
cl_platform_id platform,
cl_d3d11_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D11_H


@@ -1,143 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D9_EXT_H
#define __OPENCL_CL_D3D9_EXT_H
#include <d3d9.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d9_sharing */
typedef cl_uint cl_d3d9_device_source_nv;
typedef cl_uint cl_d3d9_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D9_DEVICE_NV -1010
#define CL_INVALID_D3D9_RESOURCE_NV -1011
#define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV -1012
#define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV -1013
// cl_d3d9_device_source_nv
#define CL_D3D9_DEVICE_NV 0x4022
#define CL_D3D9_ADAPTER_NAME_NV 0x4023
// cl_d3d9_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D9_NV 0x4024
#define CL_ALL_DEVICES_FOR_D3D9_NV 0x4025
// cl_context_info
#define CL_CONTEXT_D3D9_DEVICE_NV 0x4026
// cl_mem_info
#define CL_MEM_D3D9_RESOURCE_NV 0x4027
// cl_image_info
#define CL_IMAGE_D3D9_FACE_NV 0x4028
#define CL_IMAGE_D3D9_LEVEL_NV 0x4029
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV 0x402A
#define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV 0x402B
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)(
cl_platform_id platform,
cl_d3d9_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d9_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DVertexBuffer9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DIndexBuffer9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DTexture9 *resource,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DCubeTexture9 * resource,
D3DCUBEMAP_FACES facetype,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DVolumeTexture9 * resource,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D9_H


@@ -1,118 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */


@@ -1,123 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#ifdef __APPLE__
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */

File diff suppressed because it is too large


@@ -1,154 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#define CL_GL_NUM_SAMPLES 0x2012
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */


@@ -1,44 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
#else
#include <CL/cl_gl.h>
#endif
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
cl_GLsync sync,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large


@@ -1,40 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#include <OpenCL/cl_ext.h>
#else
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#endif
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */


@@ -1,64 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"


@@ -1,595 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
* ::cudaChannelFormatKindSignedNormalized8X4,
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
* ::cudaChannelFormatKindUnsignedNormalized8X4,
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
* ::cudaChannelFormatKindSignedNormalized16X4,
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
* ::cudaChannelFormatKindUnsignedNormalized16X4
* or ::cudaChannelFormatKindNV12.
*
* The format is specified by the template specialization.
*
* The template function specializes for the following scalar types:
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
* The template function specializes for the following vector types:
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
* The template function specializes for following cudaChannelFormatKind enum values:
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
*
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
{
int e = (int)sizeof(char) * 8;
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
}
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
/* Signed 8-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
}
/* Unsigned 8-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
}
/* Signed 16-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
{
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
{
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
}
/* Unsigned 16-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
{
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
{
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
}
/* NV12 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
}
/* BC1 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
}
/* BC1sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
}
/* BC2 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
}
/* BC2sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
}
/* BC3 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
}
/* BC3sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
}
/* BC4 unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
}
/* BC4 signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
}
/* BC5 unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
}
/* BC5 signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
}
/* BC6H unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
}
/* BC6H signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
}
/* BC7 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
}
/* BC7sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
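For reference, these cudaCreateChannelDesc specializations are normally consumed when allocating a CUDA array or creating a texture object. A minimal host-side sketch using only standard runtime calls (variable names are illustrative, not part of this repository):

#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    /* 32-bit single-channel float: (32, 0, 0, 0, cudaChannelFormatKindFloat). */
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();

    /* Allocate a 256x256 CUDA array with that element format. */
    cudaArray_t arr = NULL;
    cudaError_t err = cudaMallocArray(&arr, &desc, 256, 256);
    if (err != cudaSuccess) {
        printf("cudaMallocArray failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("bits per channel: x=%d y=%d z=%d w=%d kind=%d\n",
           desc.x, desc.y, desc.z, desc.w, (int)desc.f);
    cudaFreeArray(arr);
    return 0;
}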

View file

@@ -1,65 +0,0 @@
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
#endif
#include "crt/common_functions.h"
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
#endif

File diff suppressed because it is too large

View file

@@ -1,310 +0,0 @@
/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
#endif
#if !defined(__COMMON_FUNCTIONS_H__)
#define __COMMON_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
#include "builtin_types.h"
#include "host_defines.h"
#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported. Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
#ifndef __CUDA_API_VER_MAJOR__
#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
#endif /* __CUDA_API_VER_MAJOR__ */
#ifndef __CUDA_API_VER_MINOR__
#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
#endif /* __CUDA_API_VER_MINOR__ */
#if !defined(__CUDACC_RTC__)
#include <string.h>
#include <time.h>
extern "C"
{
#endif /* !__CUDACC_RTC__ */
extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
#if defined(__QNX__)
asm("clock32")
#endif
__THROW;
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memset(void*, int, size_t) __THROW;
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memcpy(void*, const void*, size_t) __THROW;
#if !defined(__CUDACC_RTC__)
}
#endif /* !__CUDACC_RTC__ */
#if defined(__CUDA_ARCH__)
#if defined(__CUDACC_RTC__)
inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
inline __host__ __device__ void operator delete(void*, void*) { }
inline __host__ __device__ void operator delete[](void*, void*) { }
#else /* !__CUDACC_RTC__ */
#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
#include <new>
#endif
#if defined (__GNUC__)
#define STD \
std::
#else /* __GNUC__ */
#define STD
#endif /* __GNUC__ */
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, void*) throw();
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, void*) throw();
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
#endif /* __CUDACC_RTC__ */
#if !defined(__CUDACC_RTC__)
#include <stdio.h>
#include <stdlib.h>
#endif /* !__CUDACC_RTC__ */
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
namespace std {
#endif
extern "C"
{
extern
#if !defined(_MSC_VER) || _MSC_VER < 1900
_CRTIMP
#endif
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) )
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...) __THROW;
#else /* newer glibc */
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...);
#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
extern _CRTIMP __host__ __device__ __cudart_builtin__ void* __cdecl malloc(size_t) __THROW;
extern _CRTIMP __host__ __device__ __cudart_builtin__ void __cdecl free(void*) __THROW;
#if defined(_MSC_VER)
extern __host__ __device__ __cudart_builtin__ void* __cdecl _alloca(size_t);
#endif
#if defined(__QNX__)
#undef alloca
#define alloca(__S) __builtin_alloca(__S)
#endif
}
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
} /* std */
#endif
#if !defined(__CUDACC_RTC__)
#include <assert.h>
#endif /* !__CUDACC_RTC__ */
extern "C"
{
#if defined(__CUDACC_RTC__)
extern __host__ __device__ void __assertfail(const char * __assertion,
const char *__file,
unsigned int __line,
const char *__function,
size_t charsize);
#elif defined(__APPLE__)
#define __builtin_expect(exp,c) (exp)
extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
const char *, const char *, int, const char *);
#elif defined(__ANDROID__)
extern __host__ __device__ __cudart_builtin__ void __assert2(
const char *, int, const char *, const char *);
#elif defined(__QNX__)
#if !defined(_LIBCPP_VERSION)
namespace std {
#endif
extern __host__ __device__ __cudart_builtin__ void __assert(
const char *, const char *, unsigned int, const char *);
#if !defined(_LIBCPP_VERSION)
}
#endif
#elif defined(__HORIZON__)
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
const char *, const char *, int, const char *);
#elif defined(__GNUC__)
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
const char *, const char *, unsigned int, const char *)
__THROW;
#elif defined(_WIN32)
extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
const wchar_t *, const wchar_t *, unsigned);
#endif
}
#if defined(__CUDACC_RTC__)
#ifdef NDEBUG
#define assert(e) (static_cast<void>(0))
#else /* !NDEBUG */
#define __ASSERT_STR_HELPER(x) #x
#define assert(e) ((e) ? static_cast<void>(0)\
: __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
__LINE__, __PRETTY_FUNCTION__,\
sizeof(char)))
#endif /* NDEBUG */
__host__ __device__ void* operator new(size_t);
__host__ __device__ void* operator new[](size_t);
__host__ __device__ void operator delete(void*);
__host__ __device__ void operator delete[](void*);
# if __cplusplus >= 201402L
__host__ __device__ void operator delete(void*, size_t);
__host__ __device__ void operator delete[](void*, size_t);
#endif /* __cplusplus >= 201402L */
#if __cplusplus >= 201703L
namespace std { enum class align_val_t : size_t {}; }
__host__ __device__ void* __cdecl operator new(size_t sz, std::align_val_t) noexcept;
__host__ __device__ void* __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
#endif /* __cplusplus >= 201703L */
#else /* !__CUDACC_RTC__ */
#if defined (__GNUC__)
#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if (__cplusplus >= 201103L) && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
#define THROWBADALLOC
#else
#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
#define THROWBADALLOC
#else
#define THROWBADALLOC throw(STD bad_alloc)
#endif
#endif
#define __DELETE_THROW throw()
#undef __NV_GLIBCXX_VERSION
#else /* __GNUC__ */
#define THROWBADALLOC throw(...)
#endif /* __GNUC__ */
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t) THROWBADALLOC;
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t) THROWBADALLOC;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*) throw();
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
#if __cpp_aligned_new
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, std::align_val_t);
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, std::align_val_t);
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
#endif /* __cpp_aligned_new */
#undef THROWBADALLOC
#undef STD
#endif /* __CUDACC_RTC__ */
#endif /* __CUDA_ARCH__ */
#endif /* __cplusplus && __CUDACC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
#include "cuda_device_runtime_api.h"
#endif
#include "math_functions.h"
#endif /* !__COMMON_FUNCTIONS_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
#endif
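These declarations are what make printf, malloc/free, and assert callable from device code. A minimal sketch of that device-side usage (hypothetical kernel, assumes a device of compute capability 2.0 or newer for the device heap):

#include <cuda_runtime.h>
#include <stdio.h>
#include <assert.h>

__global__ void scratch_kernel(void)
{
    /* Device-side printf, as declared above. */
    printf("thread %d reporting\n", (int)threadIdx.x);

    /* Device-side heap allocation plus device-side assert. */
    int *p = (int *)malloc(4 * sizeof(int));
    assert(p != NULL);
    p[threadIdx.x % 4] = (int)threadIdx.x;
    free(p);
}

int main(void)
{
    scratch_kernel<<<1, 4>>>();
    cudaDeviceSynchronize(); /* flushes buffered device printf output */
    return 0;
}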

File diff suppressed because it is too large

View file

@@ -1,197 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
#endif
#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC_RTC__)
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
#else
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
#endif /* __CUDACC_RTC__ */
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __fma_rz(a, b, c) :
mode == cudaRoundPosInf ? __fma_ru(a, b, c) :
mode == cudaRoundMinInf ? __fma_rd(a, b, c) :
__fma_rn(a, b, c);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dmul_rz(a, b) :
mode == cudaRoundPosInf ? __dmul_ru(a, b) :
mode == cudaRoundMinInf ? __dmul_rd(a, b) :
__dmul_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dadd_rz(a, b) :
mode == cudaRoundPosInf ? __dadd_ru(a, b) :
mode == cudaRoundMinInf ? __dadd_rd(a, b) :
__dadd_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dsub_rz(a, b) :
mode == cudaRoundPosInf ? __dsub_ru(a, b) :
mode == cudaRoundMinInf ? __dsub_rd(a, b) :
__dsub_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2int_rn(a) :
mode == cudaRoundPosInf ? __double2int_ru(a) :
mode == cudaRoundMinInf ? __double2int_rd(a) :
__double2int_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2uint_rn(a) :
mode == cudaRoundPosInf ? __double2uint_ru(a) :
mode == cudaRoundMinInf ? __double2uint_rd(a) :
__double2uint_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2ll_rn(a) :
mode == cudaRoundPosInf ? __double2ll_ru(a) :
mode == cudaRoundMinInf ? __double2ll_rd(a) :
__double2ll_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2ull_rn(a) :
mode == cudaRoundPosInf ? __double2ull_ru(a) :
mode == cudaRoundMinInf ? __double2ull_rd(a) :
__double2ull_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __ll2double_rz(a) :
mode == cudaRoundPosInf ? __ll2double_ru(a) :
mode == cudaRoundMinInf ? __ll2double_rd(a) :
__ll2double_rn(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __ull2double_rz(a) :
mode == cudaRoundPosInf ? __ull2double_ru(a) :
mode == cudaRoundMinInf ? __ull2double_rd(a) :
__ull2double_rn(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
{
return (double)a;
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
{
return (double)a;
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
{
return (double)a;
}
#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
#endif
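The wrappers above only select among directed-rounding intrinsics based on cudaRoundMode. A minimal sketch showing the same choice made directly with those intrinsics (kernel and variable names are illustrative):

#include <cuda_runtime.h>

__global__ void rounding_demo(const double *a, const double *b, double *lo, double *hi)
{
    int i = (int)threadIdx.x;
    lo[i] = __dadd_rd(a[i], b[i]); /* dadd(a, b, cudaRoundMinInf): round toward -infinity */
    hi[i] = __dadd_ru(a[i], b[i]); /* dadd(a, b, cudaRoundPosInf): round toward +infinity */
}

int main(void)
{
    double *a, *b, *lo, *hi;
    cudaMallocManaged(&a, 4 * sizeof(double));
    cudaMallocManaged(&b, 4 * sizeof(double));
    cudaMallocManaged(&lo, 4 * sizeof(double));
    cudaMallocManaged(&hi, 4 * sizeof(double));
    for (int i = 0; i < 4; ++i) { a[i] = 1.0 / 3.0; b[i] = (double)i; }
    rounding_demo<<<1, 4>>>(a, b, lo, hi);
    cudaDeviceSynchronize();
    return 0;
}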

File diff suppressed because it is too large

View file

@@ -1,212 +0,0 @@
/*
* Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif
#if !defined(__DEVICE_FUNCTIONS_HPP__)
#define __DEVICE_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
#else
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
#endif /* __CUDACC_RTC__ */
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__DEVICE_FUNCTIONS_STATIC_DECL__ int mulhi(const int a, const int b)
{
return __mulhi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const unsigned int a, const unsigned int b)
{
return __umulhi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const int a, const unsigned int b)
{
return __umulhi(static_cast<unsigned int>(a), b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const unsigned int a, const int b)
{
return __umulhi(a, static_cast<unsigned int>(b));
}
__DEVICE_FUNCTIONS_STATIC_DECL__ long long int mul64hi(const long long int a, const long long int b)
{
return __mul64hi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const unsigned long long int a, const unsigned long long int b)
{
return __umul64hi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const long long int a, const unsigned long long int b)
{
return __umul64hi(static_cast<unsigned long long int>(a), b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const unsigned long long int a, const long long int b)
{
return __umul64hi(a, static_cast<unsigned long long int>(b));
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int float_as_int(const float a)
{
return __float_as_int(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float int_as_float(const int a)
{
return __int_as_float(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int float_as_uint(const float a)
{
return __float_as_uint(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float uint_as_float(const unsigned int a)
{
return __uint_as_float(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float saturate(const float a)
{
return __saturatef(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int mul24(const int a, const int b)
{
return __mul24(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int umul24(const unsigned int a, const unsigned int b)
{
return __umul24(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int float2int(const float a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundNearest) ? __float2int_rn(a) :
(mode == cudaRoundPosInf ) ? __float2int_ru(a) :
(mode == cudaRoundMinInf ) ? __float2int_rd(a) :
__float2int_rz(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int float2uint(const float a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundNearest) ? __float2uint_rn(a) :
(mode == cudaRoundPosInf ) ? __float2uint_ru(a) :
(mode == cudaRoundMinInf ) ? __float2uint_rd(a) :
__float2uint_rz(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float int2float(const int a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundZero ) ? __int2float_rz(a) :
(mode == cudaRoundPosInf) ? __int2float_ru(a) :
(mode == cudaRoundMinInf) ? __int2float_rd(a) :
__int2float_rn(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float uint2float(const unsigned int a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundZero ) ? __uint2float_rz(a) :
(mode == cudaRoundPosInf) ? __uint2float_ru(a) :
(mode == cudaRoundMinInf) ? __uint2float_rd(a) :
__uint2float_rn(a);
}
#undef __DEVICE_FUNCTIONS_DECL__
#undef __DEVICE_FUNCTIONS_STATIC_DECL__
#endif /* __cplusplus && __CUDACC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#endif /* !__DEVICE_FUNCTIONS_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif
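A brief device-side sketch of what a few of these overloads resolve to, using the documented intrinsics they wrap (kernel and buffer names are illustrative):

#include <cuda_runtime.h>

__global__ void bits_demo(const float *in, unsigned int *bits, float *clamped, int *hi32)
{
    int i = (int)threadIdx.x;
    bits[i]    = __float_as_uint(in[i]); /* float_as_uint(): reinterpret the bit pattern */
    clamped[i] = __saturatef(in[i]);     /* saturate(): clamp to [0.0f, 1.0f] */
    hi32[i]    = __mulhi(i, 1 << 16);    /* mulhi(): upper 32 bits of the 64-bit product */
}

int main(void)
{
    float *in, *clamped;
    unsigned int *bits;
    int *hi32;
    cudaMallocManaged(&in, 4 * sizeof(float));
    cudaMallocManaged(&bits, 4 * sizeof(unsigned int));
    cudaMallocManaged(&clamped, 4 * sizeof(float));
    cudaMallocManaged(&hi32, 4 * sizeof(int));
    for (int i = 0; i < 4; ++i) in[i] = 0.5f * (float)i - 0.5f;
    bits_demo<<<1, 4>>>(in, bits, clamped, hi32);
    cudaDeviceSynchronize();
    return 0;
}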

View file

@@ -1,57 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
#endif
#if !defined(__FUNC_MACRO_H__)
#define __FUNC_MACRO_H__
#if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__GNUC__)
#define __func__(decl) \
inline decl
#define __device_func__(decl) \
static __attribute__((__unused__)) decl
#elif defined(_WIN32)
#define __func__(decl) \
static inline decl
#define __device_func__(decl) \
static decl
#endif /* __GNUC__ */
#endif /* __FUNC_MACRO_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
#endif

View file

@@ -1,293 +0,0 @@
/*
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
#endif
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
/* check invalid configurations */
#if defined(__PGIC__)
#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
#endif /* defined(__PGIC__) */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#endif /* __powerpc__ */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
/* check host compiler version */
#if !__NV_NO_HOST_COMPILER_CHECK
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__powerpc__)
#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 11
#error -- unsupported GNU version! gcc versions later than 11 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* __GNUC__ > 11 */
#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__)
#if (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
#error -- unsupported clang version! clang version must be less than 14 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1910 || _MSC_VER >= 1940
#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#elif _MSC_VER >= 1910 && _MSC_VER < 1910
#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1940) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
#endif /* _WIN32 */
#endif /* !__NV_NO_HOST_COMPILER_CHECK */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
#endif
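The version gates above follow one pattern: hard-error on an unsupported host compiler unless an override macro is defined. A minimal sketch of that pattern for a hypothetical project header (the MYLIB_* names are invented for illustration and are not CUDA or koboldcpp symbols):

/* mylib_host_check.h, illustrative only */
#if !defined(MYLIB_ALLOW_UNSUPPORTED_COMPILER)
#  if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ > 11)
#    error unsupported gcc version, define MYLIB_ALLOW_UNSUPPORTED_COMPILER to override at your own risk
#  endif
#  if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1940)
#    error unsupported MSVC version, define MYLIB_ALLOW_UNSUPPORTED_COMPILER to override at your own risk
#  endif
#endif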

View file

@@ -1,246 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
#endif
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#define CUDARTAPI_CDECL
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#define CUDARTAPI_CDECL \
__cdecl
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#if defined(__CUDACC__) || !defined(__host__)
#define __host__ \
__location__(host)
#endif /* defined(__CUDACC__) || !defined(__host__) */
#if defined(__CUDACC__) || !defined(__device__)
#define __device__ \
__location__(device)
#endif /* defined(__CUDACC__) || !defined(__device__) */
#if defined(__CUDACC__) || !defined(__global__)
#define __global__ \
__location__(global)
#endif /* defined(__CUDACC__) || !defined(__global__) */
#if defined(__CUDACC__) || !defined(__shared__)
#define __shared__ \
__location__(shared)
#endif /* defined(__CUDACC__) || !defined(__shared__) */
#if defined(__CUDACC__) || !defined(__constant__)
#define __constant__ \
__location__(constant)
#endif /* defined(__CUDACC__) || !defined(__constant__) */
#if defined(__CUDACC__) || !defined(__managed__)
#define __managed__ \
__location__(managed)
#endif /* defined(__CUDACC__) || !defined(__managed__) */
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
#endif /* !__HOST_DEFINES_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
#endif
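For orientation, the deleted header above is what gives __host__, __device__, __global__, __align__ and __launch_bounds__ their meaning under nvcc. A minimal sketch of how those qualifiers show up in ordinary CUDA code (the kernel, sizes and launch bound below are illustrative, not taken from this repository):

#include <cuda_runtime.h>

// Compiled for both host and device via the __host__/__device__ annotations defined above.
__host__ __device__ inline float scale(float x) { return 2.0f * x; }

// __global__ marks a kernel entry point; __launch_bounds__(256) expands to
// __annotate__(launch_bounds(256)) and promises at most 256 threads per block.
__global__ void __launch_bounds__(256) scale_all(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = scale(data[i]);
}

int main()
{
    const int n = 1024;
    float *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    scale_all<<<(n + 255) / 256, 256>>>(d, n); // standard triple-chevron launch
    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}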

View file

@@ -1,288 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
#endif
#if !defined(__CUDA_INTERNAL_COMPILATION__)
#define __CUDA_INTERNAL_COMPILATION__
#define __text__
#define __surf__
#define __name__shadow_var(c, cpp) \
#c
#define __name__text_var(c, cpp) \
#cpp
#define __host__shadow_var(c, cpp) \
cpp
#define __text_var(c, cpp) \
cpp
#define __device_fun(fun) \
#fun
#define __device_var(var) \
#var
#define __device__text_var(c, cpp) \
#c
#define __device__shadow_var(c, cpp) \
#c
#if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \
f
#else /* _WIN32 && !_WIN64 */
#define __pad__(f)
#endif /* _WIN32 && !_WIN64 */
#include "builtin_types.h"
#include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */
template <typename T>
static inline T *__cudaAddressOf(T &val)
{
return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
}
#define __cudaRegisterBinary(X) \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
{ void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
__cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
__cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
__cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
__cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
__cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
dim3 *gridDim,
dim3 *blockDim,
size_t *sharedMem,
void *stream
);
#define __cudaLaunchPrologue(size) \
void * __args_arr[size]; \
int __args_idx = 0
#define __cudaSetupArg(arg, offset) \
__args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
#define __cudaSetupArgSimple(arg, offset) \
__args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
#if defined(__GNUC__)
#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
#else /* !__GNUC__ */
#define __NV_ATTR_UNUSED_FOR_LAUNCH
#endif /* __GNUC__ */
/* the use of __args_idx in the expression below avoids host compiler warning about it being an
unused variable when the launch has no arguments */
#define __cudaLaunch(fun) \
{ volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
dim3 __gridDim, __blockDim;\
size_t __sharedMem; \
cudaStream_t __stream; \
if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
return; \
if (__args_idx == 0) {\
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
} else { \
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
}\
}
#if defined(__GNUC__)
#define __nv_dummy_param_ref(param) \
{ volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
#else /* __GNUC__ */
#define __nv_dummy_param_ref(param) \
{ volatile static void **__ref; __ref = (volatile void **)param; }
#endif /* __GNUC__ */
static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
extern "C" {
void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
}
#define __TO_STRING_CORE(X) #X
#define __TO_STRING(X) __TO_STRING_CORE(X)
extern "C" {
#if defined(_WIN32)
#pragma data_seg("__nv_module_id")
static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
#pragma data_seg()
#elif defined(__APPLE__)
static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
#else
static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
#endif
#undef __FATIDNAME_CORE
#undef __FATIDNAME
#define __FATIDNAME_CORE(X) __fatbinwrap##X
#define __FATIDNAME(X) __FATIDNAME_CORE(X)
#define ____cudaRegisterLinkedBinary(X) \
{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
}
extern "C" {
extern void** CUDARTAPI __cudaRegisterFatBinary(
void *fatCubin
);
extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaUnregisterFatBinary(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaRegisterVar(
void **fatCubinHandle,
char *hostVar,
char *deviceAddress,
const char *deviceName,
int ext,
size_t size,
int constant,
int global
);
extern void CUDARTAPI __cudaRegisterManagedVar(
void **fatCubinHandle,
void **hostVarPtrAddress,
char *deviceAddress,
const char *deviceName,
int ext,
size_t size,
int constant,
int global
);
extern char CUDARTAPI __cudaInitModule(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaRegisterTexture(
void **fatCubinHandle,
const struct textureReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int norm,
int ext
);
extern void CUDARTAPI __cudaRegisterSurface(
void **fatCubinHandle,
const struct surfaceReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int ext
);
extern void CUDARTAPI __cudaRegisterFunction(
void **fatCubinHandle,
const char *hostFun,
char *deviceFun,
const char *deviceName,
int thread_limit,
uint3 *tid,
uint3 *bid,
dim3 *bDim,
dim3 *gDim,
int *wSize
);
#if defined(__APPLE__)
extern "C" int atexit(void (*)(void));
#elif defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
extern int atexit(void(*)(void)) throw();
#elif defined(__HORIZON__)
// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
#define atexit(p)
#else /* __GNUC__ && !__ANDROID__ */
extern int __cdecl atexit(void(__cdecl *)(void));
#endif
}
static void **__cudaFatCubinHandle;
static void __cdecl __cudaUnregisterBinaryUtil(void)
{
____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
__cudaUnregisterFatBinary(__cudaFatCubinHandle);
}
static char __nv_init_managed_rt_with_module(void **handle)
{
return __cudaInitModule(handle);
}
#include "common_functions.h"
#pragma pack()
#if defined(_WIN32)
#pragma warning(disable: 4099)
#if !defined(_WIN64)
#pragma warning(disable: 4408)
#endif /* !_WIN64 */
#endif /* _WIN32 */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
#endif
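The __cudaLaunchPrologue/__cudaSetupArg/__cudaLaunch machinery in the deleted header above is roughly what nvcc emits for a <<<...>>> launch: arguments are gathered into a void* array and handed to cudaLaunchKernel together with the popped launch configuration. A hand-written sketch of the same idea, using a hypothetical add_one kernel (illustrative only, not code from this repository):

#include <cuda_runtime.h>

__global__ void add_one(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main()
{
    const int n = 256;
    float *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));

    // Equivalent of add_one<<<1, 256>>>(d, n): pack a pointer to each argument,
    // then pass the array to cudaLaunchKernel, mirroring __cudaSetupArgSimple
    // and __cudaLaunch above.
    int n_arg = n;
    void *args[] = { &d, &n_arg };
    cudaLaunchKernel((void *)add_one, dim3(1), dim3(256), args, /*sharedMem=*/0, /*stream=*/0);

    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}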

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,754 +0,0 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead.")
#else
#warning "crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
#endif
#if !defined(__CUDA_MMA_H__)
#define __CUDA_MMA_H__
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
#define __CUDA_IMMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
#define __CUDA_SUBBYTE_IMMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#define __CUDA_AMPERE_MMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
namespace nvcuda {
namespace wmma {
// utility functions
#ifdef __CUDA_AMPERE_MMA__
inline __device__ float __float_to_tf32(float in)
{
float ret;
asm("{\n .reg .b32 __$1;"
"\n cvt.rna.tf32.f32 __$1, %1;"
"\n mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) );
return ret;
}
#endif /* __CUDA_AMPERE_MMA__ */
//
// tags
//
struct row_major;
struct col_major;
struct matrix_a;
struct matrix_b;
struct accumulator;
#ifdef __CUDA_AMPERE_MMA__
namespace precision {
struct tf32;
}
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
namespace experimental {
namespace precision {
struct u4; // 4-bit unsigned
struct s4; // 4-bit signed
struct b1; // 1-bit
}
enum bmmaBitOp { bmmaBitOpXOR = 1
#ifdef __CUDA_AMPERE_MMA__
, bmmaBitOpAND = 2
#endif /* __CUDA_AMPERE_MMA__ */
};
enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 };
}
#endif /* __CUDA_SUBBYTE_IMMA__ */
//
// layout
//
enum layout_t {
mem_row_major, mem_col_major
};
template <typename T>
struct helper_traits {
typedef T element_type;
typedef T storage_element_type;
typedef T fill_argument_type;
};
#ifdef __CUDA_SUBBYTE_IMMA__
template<> struct helper_traits<experimental::precision::u4> {
typedef experimental::precision::u4 element_type;
typedef unsigned int storage_element_type;
typedef unsigned int fill_argument_type;
};
template<> struct helper_traits<experimental::precision::s4> {
typedef experimental::precision::s4 element_type;
typedef int storage_element_type;
typedef int fill_argument_type;
};
template<> struct helper_traits<experimental::precision::b1> {
typedef experimental::precision::b1 element_type;
typedef unsigned int storage_element_type;
typedef unsigned int fill_argument_type;
};
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> struct helper_traits<precision::tf32> {
typedef precision::tf32 element_type;
typedef float storage_element_type;
typedef float fill_argument_type;
};
#endif /* __CUDA_AMPERE_MMA__ */
//
// The base fragment type
//
/* note: alignment required for compiler implementation */
template <typename T, int size, int packed_size = size>
struct __align__(8) __frag_base {
/* Number of elements in the fragment */
enum {num_elements = size};
/* Number of storage elements in the fragment.
The elements of the fragment are packed together when the
fragment element type is experimental::precision::u4,
experimental::precision::s4 or experimental::precision::b1.
When elements are packed, num_storage_elements
will be smaller than num_elements.
*/
enum {num_storage_elements = packed_size};
/* element type of the fragment */
typedef T element_type;
/* element type of the storage representation.
The mapping from element_type to storage_element_type is as follows:
experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
experimental::precision::s4 -> int (8 elements in 1 storage element)
experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
precision::tf32 -> float (1 element in 1 storage element)
all other types T -> T
*/
typedef typename helper_traits<T>::storage_element_type storage_element_type;
/* Storage for the (possibly packed) fragment elements. */
storage_element_type x[num_storage_elements];
};
template <typename FragEleType, typename StorageType, typename ArgType>
static inline __device__ StorageType __get_storage_value(ArgType in) { return in; }
#ifdef __CUDA_SUBBYTE_IMMA__
template<>
__device__ inline unsigned
__get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
{
/* For experimental::precision::u4 fragment element type, pack 8 elements into a single
32-bit unsigned int storage element */
unsigned val = in & 0xf;
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
(val << 20) | (val << 24) | (val << 28));
};
template<>
__device__ inline int
__get_storage_value<experimental::precision::s4, int, int>(int in)
{
/* For experimental::precision::s4 fragment element type, pack 8 elements into a single
32-bit signed int storage element */
int val = in & 0xf;
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
(val << 20) | (val << 24) | (val << 28));
};
template<>
__device__ inline unsigned
__get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
{
/* For experimental::precision::b1 fragment element type, pack 32 elements into a
single 32-bit unsigned int storage element */
return (in & 0x1) ? 0xFFFFFFFFU : 0;
}
#endif /* __CUDA_SUBBYTE_IMMA__ */
template <typename FragEleType, int size, int packed_size>
__CUDA_MMA_DEVICE_DECL__ void fill_fragment(__frag_base<FragEleType, size, packed_size>& f,
/* The mapping from fragment element type (FragEleType) to fill_argument_type is:
experimental::precision::u4 -> unsigned (only lower 4 bits taken)
experimental::precision::s4 -> int (only lower 4 bits taken)
experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
precision::tf32 -> float
all other types T -> T
*/
const typename helper_traits<FragEleType>::fill_argument_type & in) {
/* get the (possibly packed) storage element value. See the specializations above for fragment
element types where the storage representation is packed */
typedef typename helper_traits<FragEleType>::storage_element_type storage_type;
storage_type v = __get_storage_value<FragEleType, storage_type>(in);
#pragma unroll
for (int i=0; i< f.num_storage_elements; i++)
f.x[i] = v;
}
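/* Worked example of the packing above (added illustration, not in the original
   header): for a u4 fragment, fill_fragment(f, 5u) takes val = 5 & 0xf = 0x5 and
   replicates it into 0x55555555, so each 32-bit storage element carries eight
   packed copies of 5; for b1, any odd fill value becomes 0xFFFFFFFF (32 set
   bits) and any even value becomes 0. */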
//
// Fragment template
//
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
//
// Fragments for 16x16x16
//
template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Fragments for 32x8x16
//
template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Fragments for 8x32x16
//
template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Fragments for 8x8x32
//
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
//
// Fragments for 8x8x128
//
template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Fragments for 16x16x8
//
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
//
// Fragments for 8x8x4
//
template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Load functions for frags of shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Load functions for frags of shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Load functions for frags of shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Load functions for frags of shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Store functions for frags of shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
//
// Store functions for frags of shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
//
// Store functions for frags of shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Store functions for frags of shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Store functions for frags of shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Store functions for frags of shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Store functions for frags of shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// MMA functions for shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
//
// MMA functions for shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
experimental::bmmaBitOp = experimental::bmmaBitOpXOR,
experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// MMA functions for shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
//
// MMA functions for shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
};
};
#undef __DEF_IF_HOST
#undef __CUDA_IMMA__
#undef __CUDA_SUBBYTE_IMMA__
#undef __CUDA_AMPERE_MMA__
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __CUDA_MMA_DEVICE_DECL__
#if defined(__CUDA_ARCH__)
#include "mma.hpp"
#endif /* defined(__CUDA_ARCH__) */
#endif /* !__CUDA_MMA_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
#endif
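For context, a minimal usage sketch of the wmma mma_sync overload family declared above, driven through the public nvcuda::wmma interface rather than these internal declarations. The kernel and buffer names are illustrative assumptions, and one full warp is assumed to be launched per 8x32 output tile.

#include <mma.h>
#include <cuda_fp16.h>
using namespace nvcuda;

// One warp computes one 8x32 tile: acc = A(8x16) * B(16x32) + acc.
__global__ void wmma_tile_m8n32k16(const __half *a, const __half *b, float *c) {
    wmma::fragment<wmma::matrix_a, 8, 32, 16, __half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 8, 32, 16, __half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 8, 32, 16, float> acc;

    wmma::fill_fragment(acc, 0.0f);                            // zero the accumulator tile
    wmma::load_matrix_sync(a_frag, a, 16);                     // row-major A, leading dimension 16
    wmma::load_matrix_sync(b_frag, b, 16);                     // col-major B, leading dimension 16
    wmma::mma_sync(acc, a_frag, b_frag, acc);                  // one of the overloads declared above
    wmma::store_matrix_sync(c, acc, 32, wmma::mem_row_major);  // 8x32 result, leading dimension 32
}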

File diff suppressed because it is too large.

View file

@@ -1,621 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead.")
#else
#warning "crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
#endif
#ifndef __NV_LIBCXX_FUNCTIONAL_H__
#define __NV_LIBCXX_FUNCTIONAL_H__
#if __cplusplus < 201103L
#if defined(_MSC_VER)
#if _MSC_VER < 1800
#error This library requires VS 2013 and above
#endif /* _MSC_VER < 1800 */
#else /* !_MSC_VER */
#error This library requires support for the ISO C++ 2011 standard
#endif /* _MSC_VER */
#endif /* __cplusplus */
#if defined(_MSC_VER)
#define __NV_ALIGNOF __alignof
#define __NV_NOEXCEPT
#define __NV_CONSTEXPR
#else /* !_MSC_VER */
#define __NV_ALIGNOF alignof
#define __NV_NOEXCEPT noexcept
#define __NV_CONSTEXPR constexpr
#endif /* _MSC_VER */
#include <type_traits>
#include <cstddef>
#include <new>
// n3290 20.8
namespace nvstd
{
namespace internal {
// D.8.1 base (deprecated) [depr.base]
template <class _Arg, class _Result>
struct unary_function
{
typedef _Arg argument_type;
typedef _Result result_type;
};
template <class _Arg1, class _Arg2, class _Result>
struct binary_function
{
typedef _Arg1 first_argument_type;
typedef _Arg2 second_argument_type;
typedef _Result result_type;
};
// move
template <class _T>
inline __device__ __host__
typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
{
return static_cast<typename std::remove_reference<_T>::type&&>(__t);
}
// 20.2.2 swap [utility.swap]
// swap
template<class _T,
class = typename std::enable_if<
std::is_move_constructible<_T>::value &&
std::is_move_assignable<_T>::value>::type>
inline __device__ __host__
void swap(_T& __a, _T& __b)
#if !defined(_MSC_VER)
noexcept(std::is_nothrow_move_constructible<_T>::value &&
std::is_nothrow_move_assignable<_T>::value)
#endif /* !defined(_MSC_VER) */
{
_T __t(internal::move(__a));
__a = internal::move(__b);
__b = internal::move(__t);
}
// 20.2.3 forward/move helpers [forward]
// forward
template <class _T>
inline __device__ __host__
_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
{
return static_cast<_T&&>(__t);
}
template <class _T>
inline __device__ __host__
_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
{
static_assert(!std::is_lvalue_reference<_T>::value,
"Error: __t is instantiated with an lvalue reference type");
return static_cast<_T&&>(__t);
}
} // namespace internal
namespace __functional_helpers
{
struct __dummy_class;
// Store small functors locally:
// a functor is eligible for local (small-buffer) storage if it is one of the following types:
// * member object pointer;
// * member function pointer;
// * closure type of size less than or equal to the largest size of
// the above types;
// * function pointer;
// * any callable class whose size is less than or equal to
// the largest one of the above types;
union _Small_functor_types
{
void *__obj;
void (*__func_ptr)();
void (__dummy_class::*mem_fn_ptr)();
};
struct _Small_functor_data {
char __data[sizeof(_Small_functor_types)];
};
template <class _RetType, class ..._ArgTypes>
struct __maybe_base_function
{ };
template <class _RetType, class _T1>
struct __maybe_base_function<_RetType(_T1)>
: public internal::unary_function<_T1, _RetType>
{ };
template <class _RetType, class _T1, class _T2>
struct __maybe_base_function<_RetType(_T1, _T2)>
: public internal::binary_function<_T1, _T2, _RetType>
{ };
} // namespace __functional_helpers
// 20.8.11 Polymorphic function wrappers [func.wrap]
// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
// unimplemented because of exception
// class bad_function_call : public std::exception
// 20.8.11.2 Class template function [func.wrap.func]
template<class> class function; // undefined
// Simplified version of template class function, which
// * does not support allocator_arg_t;
// * does not support target and target_type that rely on RTTI
// * does not throw bad_function_call exception on invoking a NULL target
template <class _RetType, class ..._ArgTypes>
class function<_RetType(_ArgTypes...)>
: public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
{
__functional_helpers::_Small_functor_data __small_functor_data;
void *__obj;
typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
__meta_fn_type __meta_fn;
typedef void(*__cloner_type)(function &, const function &);
__cloner_type __cloner;
typedef void(*__destructor_type)(function *);
__destructor_type __destructor;
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
__NV_CONSTEXPR bool __use_small_functor_data() const
{
return (sizeof(_F) <= sizeof(__small_functor_data) &&
__NV_ALIGNOF(_F) <= __NV_ALIGNOF(
__functional_helpers::_Small_functor_types));
}
#pragma nv_exec_check_disable
__device__ __host__
void* __get_small_functor_data() const
{
return (void*)(&__small_functor_data.__data[0]);
}
#pragma nv_exec_check_disable
__device__ __host__
bool __is_small_functor_data() const
{
return __obj == __get_small_functor_data();
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static _F& __get_functor(void *__p)
{
return *((_F*)__p);
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static bool __is_empty_functor(const _F& /*__p*/)
{
return false;
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static bool __is_empty_functor(const _F* __p)
{
return !__p;
}
#pragma nv_exec_check_disable
template <class _Res, class _C>
__device__ __host__
static bool __is_empty_functor(const _Res _C::* __p)
{
return !__p;
}
#pragma nv_exec_check_disable
template <class _Res, class... _Args>
__device__ __host__
static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
{
return !__p;
}
template <class _F>
struct __make_cloner
{
#pragma nv_exec_check_disable
__device__ __host__
static void __clone_data(function &__dest, const function &__src)
{
if (__dest.__use_small_functor_data<_F>()) {
__dest.__obj = __dest.__get_small_functor_data();
new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj));
}
else {
__dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj));
}
}
};
template <class _F>
struct __make_destructor
{
#pragma nv_exec_check_disable
__device__ __host__
static void __destruct(function *__fn)
{
if (__fn->__use_small_functor_data<_F>()) {
(__fn->__get_functor<_F>(__fn->__obj)).~_F();
}
else {
delete (_F*)(__fn->__obj);
}
}
};
// We cannot simply define __make_functor in the following way:
// template <class _T, _F>
// __make_functor;
// template <class _RetType1, class _F, class... _ArgTypes1>
// struct __make_functor<_RetType1(_ArgTypes1...), _F>
//
// because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
template <class _RetType1, class _F, class... _ArgTypes1>
struct __make_functor
{
typedef _RetType1 type;
#pragma nv_exec_check_disable
__device__ __host__
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
{
return __get_functor<_F>(__d)(
internal::forward<_ArgTypes1>(__args)...);
}
};
template <class _RetType1, class _C, class _M, class... _ArgTypes1>
struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
{
typedef _RetType1 type;
typedef _RetType1(*_Fn)(_ArgTypes1...);
#pragma nv_exec_check_disable
__device__ __host__
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
{
return __get_functor<_Fn>(__d)(
internal::forward<_ArgTypes1>(__args)...);
}
};
// workaround for GCC version below 4.8
#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
template <class _F>
struct __check_callability
: public std::integral_constant<bool,
!std::is_same<_F, std::nullptr_t>::value>
{ };
#elif defined(_MSC_VER)
// simulate VC 2013's behavior...
template <class _F>
struct __check_callability1
: public
std::integral_constant<bool,
// std::result_of does not handle member pointers well
std::is_member_pointer<_F>::value ||
std::is_convertible<
_RetType,
typename std::result_of<_F(_ArgTypes...)>::type
>::value
>
{ };
template <class _F>
struct __check_callability
: public std::integral_constant<
bool,
!std::is_same<_F, function>::value &&
__check_callability1<typename std::remove_cv<_F>::type>::value>
{ };
#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
template <class _F,
class _T = typename std::result_of<_F(_ArgTypes...)>::type>
struct __check_callability
: public std::integral_constant<
bool,
!std::is_same<_F, function>::value &&
std::is_convertible< _T, _RetType>::value>
{ };
#endif /* __GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
#pragma nv_exec_check_disable
__device__ __host__
void __destroy()
{
if (__obj) {
__destructor(this);
__obj = 0;
}
}
#pragma nv_exec_check_disable
__device__ __host__
void __clear()
{
__obj = 0;
__meta_fn = 0;
__cloner = 0;
__destructor = 0;
}
public:
typedef _RetType result_type;
/*
* These typedef(s) are derived from __maybe_base_function
* typedef T1 argument_type; // only if sizeof...(ArgTypes) == 1 and
* // the type in ArgTypes is T1
* typedef T1 first_argument_type; // only if sizeof...(ArgTypes) == 2 and
* // ArgTypes contains T1 and T2
* typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
* // ArgTypes contains T1 and T2
*/
// 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
#pragma nv_exec_check_disable
__device__ __host__
function() __NV_NOEXCEPT
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
#pragma nv_exec_check_disable
__device__ __host__
function(std::nullptr_t) __NV_NOEXCEPT
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
#pragma nv_exec_check_disable
__device__ __host__
function(const function &__fn)
{
if (__fn.__obj == 0) {
__clear();
}
else {
__meta_fn = __fn.__meta_fn;
__destructor = __fn.__destructor;
__fn.__cloner(*this, __fn);
__cloner = __fn.__cloner;
}
}
#pragma nv_exec_check_disable
__device__ __host__
function(function &&__fn)
{
__fn.swap(*this);
}
// VS 2013 cannot process __check_callability type trait.
// So, we check callability using static_assert instead of
// using SFINAE such as
// template<class _F,
// class = typename std::enable_if<
// __check_callability<_F>::value
// >::type>
#pragma nv_exec_check_disable
template<class _F>
__device__ __host__
function(_F);
// copy and swap
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(const function& __fn)
{
function(__fn).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(function&& __fn)
{
function(internal::move(__fn)).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(std::nullptr_t)
{
__destroy();
return *this;
}
#pragma nv_exec_check_disable
template<class _F>
__device__ __host__
function&
operator=(_F&& __fn)
{
static_assert(__check_callability<_F>::value,
"Unable to create functor object!");
function(internal::forward<_F>(__fn)).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
~function()
{
__destroy();
}
// 20.8.11.2.2 function modifiers [func.wrap.func.mod]
#pragma nv_exec_check_disable
__device__ __host__
void swap(function& __fn) __NV_NOEXCEPT
{
internal::swap(__meta_fn, __fn.__meta_fn);
internal::swap(__cloner, __fn.__cloner);
internal::swap(__destructor, __fn.__destructor);
if (__is_small_functor_data() && __fn.__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
}
else if (__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
internal::swap(__obj, __fn.__obj);
__fn.__obj = __fn.__get_small_functor_data();
}
else if (__fn.__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
internal::swap(__obj, __fn.__obj);
__obj = __get_small_functor_data();
}
else {
internal::swap(__obj, __fn.__obj);
}
}
// 20.8.11.2.3 function capacity [func.wrap.func.cap]
#pragma nv_exec_check_disable
__device__ __host__
explicit operator bool() const __NV_NOEXCEPT
{
return __obj;
}
// 20.8.11.2.4 function invocation [func.wrap.func.inv]
// function::operator() can only be called in device code
// to avoid cross-execution space calls
#pragma nv_exec_check_disable
__device__ __host__
_RetType operator()(_ArgTypes...) const;
};
// Out-of-line definitions
#pragma nv_exec_check_disable
template<class _RetType, class... _ArgTypes>
template<class _F>
__device__ __host__
function<_RetType(_ArgTypes...)>::function(_F __fn)
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
{
static_assert(__check_callability<_F>::value,
"Unable to construct functor object!");
if (__is_empty_functor(__fn))
return;
__meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
__cloner = &__make_cloner<_F>::__clone_data;
__destructor = &__make_destructor<_F>::__destruct;
if (__use_small_functor_data<_F>()) {
__obj = __get_small_functor_data();
new ((void*)__obj) _F(internal::move(__fn));
}
else {
__obj = new _F(internal::move(__fn));
}
}
#pragma nv_exec_check_disable
template <class _RetType, class..._ArgTypes>
__device__ __host__
_RetType
function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
{
return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
}
// 20.8.11.2.6, Null pointer comparisons:
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
__NV_NOEXCEPT
{
return !__fn;
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
__NV_NOEXCEPT
{
return !__fn;
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
__NV_NOEXCEPT
{
return static_cast<bool>(__fn);
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
__NV_NOEXCEPT
{
return static_cast<bool>(__fn);
}
// 20.8.11.2.7, specialized algorithms:
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
{
__fn1.swap(__fn2);
}
} // namespace nvstd
#undef __NV_NOEXCEPT
#undef __NV_CONSTEXPR
#undef __NV_ALIGNOF
#endif // __NV_LIBCXX_FUNCTIONAL_H__
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
#endif
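As a point of reference, a minimal sketch of how the simplified nvstd::function above is typically used in device code; the kernel name and the wrapped lambda are illustrative assumptions, not part of the deleted header.

#include <nvfunctional>

// A small closure fits the local small-functor buffer, so no device-side heap
// allocation is needed; operator() dispatches through the stored meta-function pointer.
__global__ void apply_twice(const int *in, int *out) {
    nvstd::function<int(int)> f = [](int v) { return v + 1; };
    out[threadIdx.x] = f(f(in[threadIdx.x]));
}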

View file

@@ -1,131 +0,0 @@
/*
* Copyright 2017-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
#endif
#if !defined(__SM_70_RT_H__)
#define __SM_70_RT_H__
#if defined(__CUDACC_RTC__)
#define __SM_70_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_70_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
/******************************************************************************
* match *
******************************************************************************/
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __DEF_IF_HOST
#undef __SM_70_RT_DECL__
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
#include "sm_70_rt.hpp"
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
#endif /* !__SM_70_RT_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
#endif
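A hedged usage sketch of the match intrinsics declared above (compute capability 7.0 or newer); the kernel and buffer names are assumptions for illustration only.

// Each lane obtains a bitmask of the warp lanes that hold the same key value.
__global__ void group_lanes_by_key(const int *keys, unsigned *peer_masks) {
    unsigned lane = threadIdx.x & 31u;
    int key = keys[blockIdx.x * 32 + lane];
    unsigned peers = __match_any_sync(0xffffffffu, key);  // lanes sharing this key
    peer_masks[blockIdx.x * 32 + lane] = peers;
}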

View file

@@ -1,192 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
#endif
#if !defined(__SM_70_RT_HPP__)
#define __SM_70_RT_HPP__
#if defined(__CUDACC_RTC__)
#define __SM_70_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_70_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* Below are implementations of SM-7.0 builtin functions which are included as *
* source (instead of being built in to the compiler) *
* *
*******************************************************************************/
//
// __match_any_sync
//
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
return __match32_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
return __match32_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
return (sizeof(long) == sizeof(long long)) ?
__match64_any_sync(mask, (unsigned long long)value):
__match32_any_sync(mask, (unsigned)value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
return (sizeof(long) == sizeof(long long)) ?
__match64_any_sync(mask, (unsigned long long)value):
__match32_any_sync(mask, (unsigned)value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
return __match64_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
return __match64_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
return __match32_any_sync(mask, __float_as_uint(value));
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
return __match64_any_sync(mask, __double_as_longlong(value));
}
//
// __match_all_sync
//
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
return __match32_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
return __match32_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
return (sizeof(long) == sizeof(long long)) ?
__match64_all_sync(mask, (unsigned long long)value, pred):
__match32_all_sync(mask, (unsigned)value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
return (sizeof(long) == sizeof(long long)) ?
__match64_all_sync(mask, (unsigned long long)value, pred):
__match32_all_sync(mask, (unsigned)value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
return __match64_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
return __match64_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
return __match32_all_sync(mask, __float_as_uint(value), pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
return __match64_all_sync(mask, __double_as_longlong(value), pred);
}
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
asm volatile("nanosleep.u32 %0;" :: "r"(ns));
}
extern "C" __device__ __device_builtin__
unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
return __usAtomicCAS(address, compare, val);
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __SM_70_RT_DECL__
#endif /* !__SM_70_RT_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
#endif
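To illustrate the 16-bit atomicCAS overload implemented above, a hedged compare-and-swap retry loop that saturates an unsigned short counter at 0xFFFF; the function name is hypothetical, and a properly aligned counter in global or shared memory is assumed.

__device__ void saturating_inc_u16(unsigned short *counter) {
    unsigned short old = *counter;
    while (old != 0xFFFFu) {                       // stop once saturated
        unsigned short assumed = old;
        old = atomicCAS(counter, assumed, (unsigned short)(assumed + 1u));
        if (old == assumed)                        // our CAS won the race
            break;
    }
}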

View file

@@ -1,158 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
#endif
#if !defined(__SM_80_RT_H__)
#define __SM_80_RT_H__
#if defined(__CUDACC_RTC__)
#define __SM_80_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_80_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
/******************************************************************************
* reduce *
******************************************************************************/
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
extern "C" {
inline __device__ void *__nv_associate_access_property(const void *ptr,
unsigned long long property) {
extern __device__ void *__nv_associate_access_property_impl(const void *,
unsigned long long);
return __nv_associate_access_property_impl(ptr, property);
}
inline __device__ void __nv_memcpy_async_shared_global_4(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_4_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
}
inline __device__ void __nv_memcpy_async_shared_global_8(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_8_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
}
inline __device__ void __nv_memcpy_async_shared_global_16(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_16_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
}
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
#endif /* __cplusplus && __CUDACC__ */
#undef __DEF_IF_HOST
#undef __SM_80_RT_DECL__
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
#include "sm_80_rt.hpp"
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
#endif /* !__SM_80_RT_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
#endif

View file

@@ -1,148 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
#endif
#if !defined(__SM_80_RT_HPP__)
#define __SM_80_RT_HPP__
#if defined(__CUDACC_RTC__)
#define __SM_80_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_80_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* Below are implementations of SM-8.0 builtin functions which are included as *
* source (instead of being built in to the compiler) *
* *
*******************************************************************************/
extern "C" {
__device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
}
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) {
return __reduce_add_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) {
return __reduce_min_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) {
return __reduce_max_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) {
return __reduce_add_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) {
return __reduce_min_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) {
return __reduce_max_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) {
return __reduce_and_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) {
return __reduce_or_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) {
return __reduce_xor_sync_unsigned_impl(mask, value);
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
#endif /* __cplusplus && __CUDACC__ */
#undef __SM_80_RT_DECL__
#endif /* !__SM_80_RT_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
#endif
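A minimal sketch of the warp-reduce intrinsics implemented above, summing one value per lane across a full warp on compute capability 8.0 or newer; kernel and buffer names are illustrative assumptions.

__global__ void warp_sum(const int *in, int *out) {
    unsigned lane = threadIdx.x & 31u;
    int v = in[blockIdx.x * 32 + lane];
    int total = __reduce_add_sync(0xffffffffu, v);  // every lane receives the warp-wide sum
    if (lane == 0)
        out[blockIdx.x] = total;
}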

View file

@@ -1,142 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
#endif
#if !defined(__STORAGE_CLASS_H__)
#define __STORAGE_CLASS_H__
#if !defined(__var_used__)
#define __var_used__
#endif /* __var_used__ */
#if !defined(__loc_sc__)
#define __loc_sc__(loc, size, sc) \
__storage##_##sc##size##loc loc
#endif /* !__loc_sc__ */
#if !defined(__storage___device__)
#define __storage___device__ static __var_used__
#endif /* __storage___device__ */
#if !defined(__storage_extern__device__)
#define __storage_extern__device__ static __var_used__
#endif /* __storage_extern__device__ */
#if !defined(__storage_auto__device__)
#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__device__ */
#if !defined(__storage_static__device__)
#define __storage_static__device__ static __var_used__
#endif /* __storage_static__device__ */
#if !defined(__storage___constant__)
#define __storage___constant__ static __var_used__
#endif /* __storage___constant__ */
#if !defined(__storage_extern__constant__)
#define __storage_extern__constant__ static __var_used__
#endif /* __storage_extern__constant__ */
#if !defined(__storage_auto__constant__)
#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__constant__ */
#if !defined(__storage_static__constant__)
#define __storage_static__constant__ static __var_used__
#endif /* __storage_static__constant__ */
#if !defined(__storage___shared__)
#define __storage___shared__ static __var_used__
#endif /* __storage___shared__ */
#if !defined(__storage_extern__shared__)
#define __storage_extern__shared__ static __var_used__
#endif /* __storage_extern__shared__ */
#if !defined(__storage_auto__shared__)
#define __storage_auto__shared__ static
#endif /* __storage_auto__shared__ */
#if !defined(__storage_static__shared__)
#define __storage_static__shared__ static __var_used__
#endif /* __storage_static__shared__ */
#if !defined(__storage__unsized__shared__)
#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage__unsized__shared__ */
#if !defined(__storage_extern_unsized__shared__)
#define __storage_extern_unsized__shared__ static __var_used__
#endif /* __storage_extern_unsized__shared__ */
#if !defined(__storage_auto_unsized__shared__)
#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto_unsized__shared__ */
#if !defined(__storage_static_unsized__shared__)
#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_static_unsized__shared__ */
#if !defined(__storage___text__)
#define __storage___text__ static __var_used__
#endif /* __storage___text__ */
#if !defined(__storage_extern__text__)
#define __storage_extern__text__ static __var_used__
#endif /* __storage_extern__text__ */
#if !defined(__storage_auto__text__)
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__text__ */
#if !defined(__storage_static__text__)
#define __storage_static__text__ static __var_used__
#endif /* __storage_static__text__ */
#if !defined(__storage___surf__)
#define __storage___surf__ static __var_used__
#endif /* __storage___surf__ */
#if !defined(__storage_extern__surf__)
#define __storage_extern__surf__ static __var_used__
#endif /* __storage_extern__surf__ */
#if !defined(__storage_auto__surf__)
#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__surf__ */
#if !defined(__storage_static__surf__)
#define __storage_static__surf__ static __var_used__
#endif /* __storage_static__surf__ */
#endif /* !__STORAGE_CLASS_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
#endif
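
Side note on the deleted storage_class.h: the __loc_sc__ macro simply token-pastes a storage class, an optional size tag, and a location qualifier into one of the __storage_* macros defined above. A hypothetical expansion, shown only to illustrate the pasting (the invocation itself is not taken from this diff):

/* Hypothetical invocation, traced through the macros above:
 *
 *   __loc_sc__(__device__, , static)
 *     -> __storage_static__device__ __device__   (after ## token pasting)
 *     -> static __var_used__ __device__          (after expanding __storage_static__device__)
 *
 * The "@@@ COMPILER @@@ ERROR @@@" expansions are deliberate: an unsupported
 * combination (e.g. an auto __device__ variable) expands to tokens that cannot
 * compile, turning the invalid combination into a build-time error.
 */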

View file

@ -1,348 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
#if !defined(__CUDACC_RTC__)
#if defined(__GNUC__)
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
#endif
#endif
/* When including a C header file in C++ code, extern "C" is normally required.
* However, the standard QNX headers already contain an extern "C" guard when
* compiled as C++, and extern "C" blocks cannot be nested, so this header
* include is kept outside of the extern "C" block.
*/
#if !defined(__CUDACC__)
#include <math.h> /* import fabsf, sqrt */
#endif /* !defined(__CUDACC__) */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now we chose the
* simplest and fastest solution; however, this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now we chose the simplest
* and fastest solution; however, this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */
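
For reference, the helpers in the deleted cuComplex.h are plain __host__ __device__ inline functions, so they can be exercised from host code as well. A minimal sketch, assuming it is compiled with nvcc (the file contents and values are made up for illustration):

#include <cuComplex.h>
#include <cstdio>

int main()
{
    cuFloatComplex a = make_cuFloatComplex(3.0f, 4.0f);   // 3 + 4i
    cuFloatComplex b = make_cuFloatComplex(1.0f, -2.0f);  // 1 - 2i

    cuFloatComplex p = cuCmulf(a, b);   // (3+4i)(1-2i) = 11 - 2i
    cuFloatComplex q = cuCdivf(a, b);   // scaled division from above, guards over/underflow
    float          m = cuCabsf(a);      // |3+4i| = 5, computed without relying on hypotf

    printf("p = %g%+gi  q = %g%+gi  |a| = %g\n",
           cuCrealf(p), cuCimagf(p), cuCrealf(q), cuCimagf(q), m);
    return 0;
}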

View file

@ -1,887 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit(void);
cublasStatus CUBLASWINAPI cublasShutdown(void);
cublasStatus CUBLASWINAPI cublasGetError(void);
cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI
cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
void CUBLASWINAPI
cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv(char trans,
int m,
int n,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDgemv(char trans,
int m,
int n,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasCgemv(char trans,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZgemv(char trans,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv(char trans,
int m,
int n,
int kl,
int ku,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDgbmv(char trans,
int m,
int n,
int kl,
int ku,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasCgbmv(char trans,
int m,
int n,
int kl,
int ku,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZgbmv(char trans,
int m,
int n,
int kl,
int ku,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI
cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI
cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI cublasZtbmv(
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI
cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI
cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI cublasZtbsv(
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv(
char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
void CUBLASWINAPI cublasDsymv(char uplo,
int n,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasChemv(char uplo,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhemv(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv(char uplo,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDsbmv(char uplo,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasChbmv(char uplo,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhbmv(char uplo,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI
cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
void CUBLASWINAPI cublasDspmv(
char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo,
int n,
cuComplex alpha,
const cuComplex* AP,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhpmv(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* AP,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI
cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
void CUBLASWINAPI
cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
void CUBLASWINAPI cublasCgeru(
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
void CUBLASWINAPI cublasCgerc(
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
void CUBLASWINAPI cublasZgeru(int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
void CUBLASWINAPI cublasZgerc(int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
void CUBLASWINAPI
cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI
cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
void CUBLASWINAPI
cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
void CUBLASWINAPI cublasCher2(char uplo,
int n,
cuComplex alpha,
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
void CUBLASWINAPI cublasZher2(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI
cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
void CUBLASWINAPI
cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
void CUBLASWINAPI cublasChpr2(
char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
void CUBLASWINAPI cublasZhpr2(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm(char transa,
char transb,
int m,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDgemm(char transa,
char transb,
int m,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCgemm(char transa,
char transb,
int m,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZgemm(char transa,
char transb,
int m,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI
cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
void CUBLASWINAPI cublasDsyrk(
char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
void CUBLASWINAPI cublasCsyrk(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsyrk(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk(
char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
void CUBLASWINAPI cublasZherk(char uplo,
char trans,
int n,
int k,
double alpha,
const cuDoubleComplex* A,
int lda,
double beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k(char uplo,
char trans,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDsyr2k(char uplo,
char trans,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCsyr2k(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsyr2k(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
float beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZher2k(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
double beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm(char side,
char uplo,
int m,
int n,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDsymm(char side,
char uplo,
int m,
int n,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCsymm(char side,
char uplo,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsymm(char side,
char uplo,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm(char side,
char uplo,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZhemm(char side,
char uplo,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
float alpha,
const float* A,
int lda,
float* B,
int ldb);
void CUBLASWINAPI cublasDtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
double alpha,
const double* A,
int lda,
double* B,
int ldb);
void CUBLASWINAPI cublasCtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex* B,
int ldb);
void CUBLASWINAPI cublasZtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* B,
int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
float alpha,
const float* A,
int lda,
float* B,
int ldb);
void CUBLASWINAPI cublasDtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
double alpha,
const double* A,
int lda,
double* B,
int ldb);
void CUBLASWINAPI cublasCtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex* B,
int ldb);
void CUBLASWINAPI cublasZtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */
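
For context, the file above is the legacy cuBLAS API (thread-global state, no handle argument); newer code is expected to go through cublas_v2.h instead. A minimal sketch of the legacy flow, using only functions declared above plus cudaMemcpy, with error handling trimmed (matrix contents are made up):

/* Minimal sketch of the legacy cuBLAS API, column-major 2x2 matrices. */
#include <cublas.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    const int n = 2;
    float hA[4] = {1, 2, 3, 4};        /* column-major 2x2 */
    float hB[4] = {5, 6, 7, 8};
    float hC[4] = {0};

    cublasInit();                       /* legacy global initialization */

    float *dA, *dB, *dC;
    cublasAlloc(n * n, sizeof(float), (void**)&dA);
    cublasAlloc(n * n, sizeof(float), (void**)&dB);
    cublasAlloc(n * n, sizeof(float), (void**)&dC);

    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    /* C = 1.0 * A * B + 0.0 * C, no transposes */
    cublasSgemm('N', 'N', n, n, n, 1.0f, dA, n, dB, n, 0.0f, dC, n);
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "sgemm failed\n");

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("C[0]=%f C[3]=%f\n", hC[0], hC[3]);

    cublasFree(dA); cublasFree(dB); cublasFree(dC);
    cublasShutdown();
    return 0;
}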

File diff suppressed because it is too large

View file

@ -1,693 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
*/
#if !defined(CUBLAS_XT_H_)
#define CUBLAS_XT_H_
#include "driver_types.h"
#include "cuComplex.h" /* import complex data type */
#include "cublas_v2.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
struct cublasXtContext;
typedef struct cublasXtContext* cublasXtHandle_t;
cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
/* This routine selects the GPUs that the user wants to use for CUBLAS-XT */
cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
/* This routine allows the user to change the dimension of the tiles (blockDim x blockDim) */
cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
/* These routines allow CUBLAS-XT to pin the host memory if it finds that some of the matrices passed
are not pinned. Pinning/unpinning the host memory is still a costly operation;
it is better if the user controls the memory themselves (by pinning/unpinning only when necessary)
*/
cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
/* This routine is used to provide a CPU BLAS routine, used for sizes that are too small or for hybrid computation */
typedef enum {
CUBLASXT_FLOAT = 0,
CUBLASXT_DOUBLE = 1,
CUBLASXT_COMPLEX = 2,
CUBLASXT_DOUBLECOMPLEX = 3,
} cublasXtOpType_t;
typedef enum {
CUBLASXT_GEMM = 0,
CUBLASXT_SYRK = 1,
CUBLASXT_HERK = 2,
CUBLASXT_SYMM = 3,
CUBLASXT_HEMM = 4,
CUBLASXT_TRSM = 5,
CUBLASXT_SYR2K = 6,
CUBLASXT_HER2K = 7,
CUBLASXT_SPMM = 8,
CUBLASXT_SYRKX = 9,
CUBLASXT_HERKX = 10,
CUBLASXT_TRMM = 11,
CUBLASXT_ROUTINE_MAX = 12,
} cublasXtBlasOp_t;
/* Currently only 32-bit integer BLAS routines are supported */
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
cublasXtBlasOp_t blasOp,
cublasXtOpType_t type,
void* blasFunctor);
/* Specifies the percentage of work that should be done by the CPU; default is 0 (no work) */
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
cublasXtBlasOp_t blasOp,
cublasXtOpType_t type,
float ratio);
/* GEMM */
cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* ------------------------------------------------------- */
/* SYRK */
cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HERK */
cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const cuComplex* A,
size_t lda,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const cuDoubleComplex* A,
size_t lda,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SYR2K */
cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HERKX : variant extension of HERK */
cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* TRSM */
cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
float* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
double* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
cuComplex* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
cuDoubleComplex* B,
size_t ldb);
/* -------------------------------------------------------------------- */
/* SYMM : Symmetric Multiply Matrix*/
cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HEMM : Hermitian Matrix Multiply */
cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SYRKX : variant extension of SYRK */
cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HER2K : variant extension of HERK */
cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SPMM : Symmetric Packed Multiply Matrix*/
cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const float* alpha,
const float* AP,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const double* alpha,
const double* AP,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* AP,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* AP,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* TRMM */
cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
cuDoubleComplex* C,
size_t ldc);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_XT_H_) */
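
The cublasXt entry points above take host pointers and tile the matrices across the selected GPUs internally, so no explicit device allocation or copies are needed. A minimal sketch using only the declarations above, assuming a single GPU with device id 0 (the sizes and block dimension are arbitrary):

/* Minimal sketch of the cublasXt host-pointer API. The library tiles the host
 * matrices and streams them to the selected GPU(s). */
#include <cublasXt.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const size_t n = 512;
    float *A = (float*)calloc(n * n, sizeof(float));
    float *B = (float*)calloc(n * n, sizeof(float));
    float *C = (float*)calloc(n * n, sizeof(float));
    const float alpha = 1.0f, beta = 0.0f;

    cublasXtHandle_t handle;
    int devices[1] = {0};

    if (cublasXtCreate(&handle) != CUBLAS_STATUS_SUCCESS) return 1;
    cublasXtDeviceSelect(handle, 1, devices);   /* use GPU 0 only */
    cublasXtSetBlockDim(handle, 1024);          /* optional tile size */

    /* C = alpha * A * B + beta * C, directly on host pointers */
    cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                  n, n, n, &alpha, A, n, B, n, &beta, C, n);

    cublasXtDestroy(handle);
    free(A); free(B); free(C);
    printf("done\n");
    return 0;
}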

File diff suppressed because it is too large

View file

@ -1,273 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
 * This is the public header file for the new CUBLAS library API; it maps the generic
 * cuBLAS function names to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetWorkspace cublasSetWorkspace_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas1 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */
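The defines above are pure name mapping: once cublas_v2.h is included, a call written with the generic name compiles against the handle-based _v2 entry point declared in cublas_api.h. A hedged illustration, assuming d_x is a device pointer allocated elsewhere:

#include <cublas_v2.h>

/* Scale a device vector in place; every generic name below is rewritten by the
 * macros above (cublasCreate -> cublasCreate_v2, cublasSscal -> cublasSscal_v2, ...). */
int scale_on_device(float *d_x, int n, float alpha) {
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
        return -1;
    cublasStatus_t st = cublasSscal(handle, n, &alpha, d_x, 1);
    cublasDestroy(handle);
    return st == CUBLAS_STATUS_SUCCESS ? 0 : -1;
}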

File diff suppressed because it is too large

View file

@@ -1,805 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D10_H
#define CUDAD3D10_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D10CtxCreate cuD3D10CtxCreate_v2
#define cuD3D10ResourceGetSurfaceDimensions cuD3D10ResourceGetSurfaceDimensions_v2
#define cuD3D10ResourceGetMappedPointer cuD3D10ResourceGetMappedPointer_v2
#define cuD3D10ResourceGetMappedSize cuD3D10ResourceGetMappedSize_v2
#define cuD3D10ResourceGetMappedPitch cuD3D10ResourceGetMappedPitch_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \defgroup CUDA_D3D10 Direct3D 10 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 10 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 10 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 10 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D10 device
*/
typedef enum CUd3d10DeviceList_enum {
CU_D3D10_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D10 device */
CU_D3D10_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
CU_D3D10_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */
} CUd3d10DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
*
* If no device on \p pAdapter is CUDA-compatible then the call will fail.
*
* \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
* \param pAdapter - Adapter to query for CUDA device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevices,
* ::cudaD3D10GetDevice
*/
CUresult CUDAAPI cuD3D10GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 10 device
*
 * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 10 device \p pD3D10Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 10 device \p pD3D10Device.
*
* If any of the GPUs being used to render \p pDevice are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D10Device - Direct3D 10 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D10_DEVICE_LIST_ALL for all devices,
* ::CU_D3D10_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D10_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice,
* ::cudaD3D10GetDevices
*/
CUresult CUDAAPI cuD3D10GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
/**
* \brief Register a Direct3D 10 resource for access by CUDA
*
* Registers the Direct3D 10 resource \p pD3DResource for access by CUDA and
 * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::ID3D10Buffer: may be accessed through a device pointer.
* - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
 * If \p Flags is not one of the above specified values then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D10RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D10RegisterResource(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
/**
* \defgroup CUDA_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 10 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 10 interoperability functionality.
* @{
*/
/** Flags to register a resource */
typedef enum CUD3D10register_flags_enum {
CU_D3D10_REGISTER_FLAGS_NONE = 0x00,
CU_D3D10_REGISTER_FLAGS_ARRAY = 0x01,
} CUD3D10register_flags;
/** Flags to map or unmap a resource */
typedef enum CUD3D10map_flags_enum {
CU_D3D10_MAPRESOURCE_FLAGS_NONE = 0x00,
CU_D3D10_MAPRESOURCE_FLAGS_READONLY = 0x01,
CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
} CUD3D10map_flags;
/**
* \brief Create a CUDA context for interoperability with Direct3D 10
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice,
* ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 10
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D10_DEVICES_ALL from ::cuD3D10GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevices,
* ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 10 device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10GetDirect3DDevice(ID3D10Device **ppD3DDevice);
/**
* \brief Register a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cuD3D10UnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cuD3D10UnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::ID3D10Buffer: Cannot be used with \p Flags set to
* ::CU_D3D10_REGISTER_FLAGS_ARRAY.
* - ::ID3D10Texture1D: No restrictions.
* - ::ID3D10Texture2D: No restrictions.
* - ::ID3D10Texture3D: No restrictions.
*
* The \p Flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following values are allowed.
*
* - ::CU_D3D10_REGISTER_FLAGS_NONE: Specifies that CUDA will access this
* resource through a ::CUdeviceptr. The pointer, size, and (for textures),
* pitch for each subresource of this allocation may be queried through
* ::cuD3D10ResourceGetMappedPointer(), ::cuD3D10ResourceGetMappedSize(),
* and ::cuD3D10ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
* - ::CU_D3D10_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
* resource through a ::CUarray queried on a sub-resource basis through
* ::cuD3D10ResourceGetMappedArray(). This option is only valid for
* resources of type ::ID3D10Texture1D, ::ID3D10Texture2D, and
* ::ID3D10Texture3D.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context then
* ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
* type or is already registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource cannot be registered, then ::CUDA_ERROR_UNKNOWN
* is returned.
*
* \param pResource - Resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10RegisterResource(ID3D10Resource *pResource, unsigned int Flags);
/**
* \brief Unregister a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned.
*
* \param pResource - Resources to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnregisterResource(ID3D10Resource *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are mapped
* by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D calls
* issued before ::cuD3D10MapResources() will complete before any CUDA kernels
* issued after ::cuD3D10MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are
* presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
* returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10MapResources(unsigned int count, ID3D10Resource **ppResources);
/**
* \brief Unmap Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cuD3D10UnmapResources() will complete before any Direct3D
* calls issued after ::cuD3D10UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are not
* presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnmapResources(unsigned int count, ID3D10Resource **ppResources);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped. The
* \p Flags argument may be any of the following.
*
* - ::CU_D3D10_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_D3D10_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
* mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* \param pResource - Registered resource to set flags for
* \param Flags - Parameters for resource mapping
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int Flags);
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p SubResource may be
* accessed. The value set in \p pArray may change every time that \p pResource
* is mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D10_REGISTER_FLAGS_ARRAY, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedArray(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pDevPtr the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p SubResource. The
* value set in \p pDevPtr may change every time that \p pResource is mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D10_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* If \p pResource is of type ::ID3D10Buffer, then \p SubResource must be 0.
* If \p pResource is of any other type, then the value of \p SubResource must
* come from the subresource calculation in ::D3D10CalcSubResource().
*
* \param pDevPtr - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p SubResource. The value set
* in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p SubResource. The values set in \p pPitch and \p pPitchSlice may
* change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
* \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture10 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the dimensions of a registered surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p SubResource.
*
* Because anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture10 or
* ::IDirect3DSurface10 or if \p pResource has not been registered for use
* with CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
/** @} */ /* END CUDA_D3D10_DEPRECATED */
/** @} */ /* END CUDA_D3D10 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D10CtxCreate
#undef cuD3D10ResourceGetSurfaceDimensions
#undef cuD3D10ResourceGetMappedPointer
#undef cuD3D10ResourceGetMappedSize
#undef cuD3D10ResourceGetMappedPitch
CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetMappedSize(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif
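For reference, a hedged sketch of the non-deprecated path documented above: register an existing ID3D10Buffer, map it, read back the device pointer, then unmap and unregister. It assumes a current CUDA context on the device reported by cuD3D10GetDevice and a valid buffer created by the application; error handling is trimmed:

#include <d3d10.h>
#include <cuda.h>
#include <cudaD3D10.h>

CUresult use_d3d10_buffer(ID3D10Buffer *pBuffer) {
    CUgraphicsResource res;
    CUresult rc = cuGraphicsD3D10RegisterResource(&res, (ID3D10Resource *)pBuffer,
                                                  CU_GRAPHICS_REGISTER_FLAGS_NONE);
    if (rc != CUDA_SUCCESS) return rc;

    rc = cuGraphicsMapResources(1, &res, 0);              /* 0 = default stream */
    if (rc == CUDA_SUCCESS) {
        CUdeviceptr dptr;
        size_t bytes;
        cuGraphicsResourceGetMappedPointer(&dptr, &bytes, res);
        /* ... launch kernels that read or write dptr here, while mapped ... */
        cuGraphicsUnmapResources(1, &res, 0);
    }
    cuGraphicsUnregisterResource(res);                    /* drops the D3D refcount */
    return rc;
}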

View file

@@ -1,119 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D10TYPEDEFS_H
#define CUDAD3D10TYPEDEFS_H
// Dependent includes for cudaD3D10.h
#include <rpcsal.h>
#include <D3D10_1.h>
#include <cudaD3D10.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D10.h
*/
#define PFN_cuD3D10GetDevice PFN_cuD3D10GetDevice_v2010
#define PFN_cuD3D10GetDevices PFN_cuD3D10GetDevices_v3020
#define PFN_cuGraphicsD3D10RegisterResource PFN_cuGraphicsD3D10RegisterResource_v3000
#define PFN_cuD3D10CtxCreate PFN_cuD3D10CtxCreate_v3020
#define PFN_cuD3D10CtxCreateOnDevice PFN_cuD3D10CtxCreateOnDevice_v3020
#define PFN_cuD3D10GetDirect3DDevice PFN_cuD3D10GetDirect3DDevice_v3020
#define PFN_cuD3D10RegisterResource PFN_cuD3D10RegisterResource_v2010
#define PFN_cuD3D10UnregisterResource PFN_cuD3D10UnregisterResource_v2010
#define PFN_cuD3D10MapResources PFN_cuD3D10MapResources_v2010
#define PFN_cuD3D10UnmapResources PFN_cuD3D10UnmapResources_v2010
#define PFN_cuD3D10ResourceSetMapFlags PFN_cuD3D10ResourceSetMapFlags_v2010
#define PFN_cuD3D10ResourceGetMappedArray PFN_cuD3D10ResourceGetMappedArray_v2010
#define PFN_cuD3D10ResourceGetMappedPointer PFN_cuD3D10ResourceGetMappedPointer_v3020
#define PFN_cuD3D10ResourceGetMappedSize PFN_cuD3D10ResourceGetMappedSize_v3020
#define PFN_cuD3D10ResourceGetMappedPitch PFN_cuD3D10ResourceGetMappedPitch_v3020
#define PFN_cuD3D10ResourceGetSurfaceDimensions PFN_cuD3D10ResourceGetSurfaceDimensions_v3020
/**
* Type definitions for functions defined in cudaD3D10.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevice_v2010)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D10RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDirect3DDevice_v3020)(ID3D10Device **ppD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10RegisterResource_v2010)(ID3D10Resource *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10UnregisterResource_v2010)(ID3D10Resource *pResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10MapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
typedef CUresult (CUDAAPI *PFN_cuD3D10UnmapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceSetMapFlags_v2010)(ID3D10Resource *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedArray_v2010)(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v3020)(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
/*
* Type definitions for older versioned functions in cudaD3D10.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v2010)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v2010)(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v2010)(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v2010)(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v2010)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard
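The PFN_* typedefs above exist so the driver entry points can be resolved at run time and called through a correctly typed pointer. A hedged sketch using plain Win32 dynamic loading; the nvcuda.dll name, the unversioned export name, and the loading strategy are assumptions about intended usage, not something this repository prescribes:

#include <windows.h>
#include <cuda.h>
#include <cudaD3D10Typedefs.h>

/* Resolve cuD3D10GetDevice from the driver library and call it through the
 * v2010 function-pointer type defined above. */
static CUresult get_device_for_adapter(IDXGIAdapter *pAdapter, CUdevice *dev) {
    HMODULE drv = LoadLibraryA("nvcuda.dll");     /* CUDA driver library on Windows */
    if (!drv) return CUDA_ERROR_NOT_INITIALIZED;
    PFN_cuD3D10GetDevice_v2010 pfnGetDevice =
        (PFN_cuD3D10GetDevice_v2010)GetProcAddress(drv, "cuD3D10GetDevice");
    if (!pfnGetDevice) return CUDA_ERROR_NOT_FOUND;
    return pfnGetDevice(dev, pAdapter);           /* library stays loaded on purpose */
}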

View file

@@ -1,357 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D11_H
#define CUDAD3D11_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D11CtxCreate cuD3D11CtxCreate_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \defgroup CUDA_D3D11 Direct3D 11 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 11 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 11 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 11 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D11 device
*/
typedef enum CUd3d11DeviceList_enum {
CU_D3D11_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D11 device */
CU_D3D11_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
CU_D3D11_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */
} CUd3d11DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
*
* If no device on \p pAdapter is CUDA-compatible the call will return
* ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
* \param pAdapter - Adapter to query for CUDA device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevices,
* ::cudaD3D11GetDevice
*/
CUresult CUDAAPI cuD3D11GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 11 device
*
 * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 11 device \p pD3D11Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 11 device \p pD3D11Device.
*
* If any of the GPUs being used to render \p pDevice are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D11Device - Direct3D 11 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D11_DEVICE_LIST_ALL for all devices,
* ::CU_D3D11_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D11_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice,
* ::cudaD3D11GetDevices
*/
CUresult CUDAAPI cuD3D11GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
/**
* \brief Register a Direct3D 11 resource for access by CUDA
*
* Registers the Direct3D 11 resource \p pD3DResource for access by CUDA and
 * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::ID3D11Buffer: may be accessed through a device pointer.
* - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
 * If \p Flags is not one of the above specified values then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D11RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D11RegisterResource(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
/**
* \defgroup CUDA_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 11 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 11 interoperability functionality.
* @{
*/
/**
* \brief Create a CUDA context for interoperability with Direct3D 11
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice,
* ::cuGraphicsD3D11RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 11
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D11_DEVICES_ALL from ::cuD3D11GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevices,
* ::cuGraphicsD3D11RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 11 device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11GetDirect3DDevice(ID3D11Device **ppD3DDevice);
/** @} */ /* END CUDA_D3D11_DEPRECATED */
/** @} */ /* END CUDA_D3D11 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D11CtxCreate
CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,92 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D11TYPEDEFS_H
#define CUDAD3D11TYPEDEFS_H
// Dependent includes for cudaD3D11.h
#include <rpcsal.h>
#include <D3D11_1.h>
#include <cudaD3D11.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D11.h
*/
#define PFN_cuD3D11GetDevice PFN_cuD3D11GetDevice_v3000
#define PFN_cuD3D11GetDevices PFN_cuD3D11GetDevices_v3020
#define PFN_cuGraphicsD3D11RegisterResource PFN_cuGraphicsD3D11RegisterResource_v3000
#define PFN_cuD3D11CtxCreate PFN_cuD3D11CtxCreate_v3020
#define PFN_cuD3D11CtxCreateOnDevice PFN_cuD3D11CtxCreateOnDevice_v3020
#define PFN_cuD3D11GetDirect3DDevice PFN_cuD3D11GetDirect3DDevice_v3020
/**
* Type definitions for functions defined in cudaD3D11.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevice_v3000)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D11RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDirect3DDevice_v3020)(ID3D11Device **ppD3DDevice);
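/*
* Example (not part of the original header): a sketch of the dynamic-loading
* pattern these PFN_* typedefs are meant to support on Windows. It assumes
* <windows.h> is available and that the display driver exports this function
* under its plain name (the _v3000 typedef matches that export); the caller
* must check for NULL.
*/
#include <windows.h>
static PFN_cuGraphicsD3D11RegisterResource exampleLoadRegisterResource(void)
{
    HMODULE driver = LoadLibraryA("nvcuda.dll");
    if (driver == NULL)
        return NULL;
    return (PFN_cuGraphicsD3D11RegisterResource)
        GetProcAddress(driver, "cuGraphicsD3D11RegisterResource");
}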
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,886 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D9_H
#define CUDAD3D9_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D9CtxCreate cuD3D9CtxCreate_v2
#define cuD3D9ResourceGetSurfaceDimensions cuD3D9ResourceGetSurfaceDimensions_v2
#define cuD3D9ResourceGetMappedPointer cuD3D9ResourceGetMappedPointer_v2
#define cuD3D9ResourceGetMappedSize cuD3D9ResourceGetMappedSize_v2
#define cuD3D9ResourceGetMappedPitch cuD3D9ResourceGetMappedPitch_v2
#define cuD3D9MapVertexBuffer cuD3D9MapVertexBuffer_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file cudaD3D9.h
* \brief Header file for the Direct3D 9 interoperability functions of the
* low-level CUDA driver application programming interface.
*/
/**
* \defgroup CUDA_D3D9 Direct3D 9 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 9 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 9 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 9 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D9 device
*/
typedef enum CUd3d9DeviceList_enum {
CU_D3D9_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D9 device */
CU_D3D9_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
CU_D3D9_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */
} CUd3d9DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter name \p pszAdapterName obtained from ::EnumDisplayDevices() or
* ::IDirect3D9::GetAdapterIdentifier().
*
* If no device on the adapter with name \p pszAdapterName is CUDA-compatible,
* then the call will fail.
*
* \param pCudaDevice - Returned CUDA device corresponding to pszAdapterName
* \param pszAdapterName - Adapter name to query for device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cudaD3D9GetDevice
*/
CUresult CUDAAPI cuD3D9GetDevice(CUdevice *pCudaDevice, const char *pszAdapterName);
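/*
* Example (not part of the original header): a hedged sketch of feeding an
* adapter name from ::EnumDisplayDevices() into ::cuD3D9GetDevice. It assumes
* <windows.h> is included and simply queries adapter 0 for illustration;
* mapping a Win32 failure to ::CUDA_ERROR_UNKNOWN is a choice made for this
* sketch, not something the API prescribes.
*/
static CUresult exampleGetCudaDeviceForAdapter0(CUdevice *pCudaDevice)
{
    DISPLAY_DEVICEA dd;
    ZeroMemory(&dd, sizeof(dd));
    dd.cb = sizeof(dd);
    if (!EnumDisplayDevicesA(NULL, 0, &dd, 0))
        return CUDA_ERROR_UNKNOWN;
    return cuD3D9GetDevice(pCudaDevice, dd.DeviceName);
}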
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 9 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 9 device \p pD3D9Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 9 device \p pD3D9Device.
*
* If any of the GPUs being used to render \p pD3D9Device are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D9Device - Direct3D 9 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D9_DEVICE_LIST_ALL for all devices,
* ::CU_D3D9_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D9_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cudaD3D9GetDevices
*/
CUresult CUDAAPI cuD3D9GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
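/*
* Example (not part of the original header): a sketch of enumerating the CUDA
* devices behind an existing IDirect3DDevice9. "d3d9Device" is assumed to be
* created elsewhere; the fixed array size of 8 is arbitrary and only for
* illustration.
*/
static CUresult exampleListD3D9CudaDevices(IDirect3DDevice9 *d3d9Device)
{
    CUdevice devices[8];
    unsigned int deviceCount = 0;
    CUresult status = cuD3D9GetDevices(&deviceCount, devices, 8, d3d9Device,
                                       CU_D3D9_DEVICE_LIST_ALL);
    if (status == CUDA_SUCCESS) {
        /* "deviceCount" CUDA devices back this D3D9 device; at most the first
         * 8 entries of "devices" were filled in. */
    }
    return status;
}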
/**
* \brief Create a CUDA context for interoperability with Direct3D 9
*
* Creates a new CUDA context, enables interoperability for that context with
* the Direct3D device \p pD3DDevice, and associates the created CUDA context
* with the calling thread.
* The created ::CUcontext will be returned in \p *pCtx.
* Direct3D resources from this device may be registered and mapped through the
* lifetime of this CUDA context.
* If \p pCudaDevice is non-NULL then the ::CUdevice on which this CUDA context was
* created will be returned in \p *pCudaDevice.
*
* On success, this call will increase the internal reference count on
* \p pD3DDevice. This reference count will be decremented upon destruction of
* this context through ::cuCtxDestroy().
* This context will cease to function if \p pD3DDevice is destroyed or encounters
* an error.
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
* operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
* is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9GetDevice,
* ::cuGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 9
*
* Creates a new CUDA context, enables interoperability for that context with
* the Direct3D device \p pD3DDevice, and associates the created CUDA context
* with the calling thread.
* The created ::CUcontext will be returned in \p *pCtx.
* Direct3D resources from this device may be registered and mapped through the
* lifetime of this CUDA context.
*
* On success, this call will increase the internal reference count on
* \p pD3DDevice. This reference count will be decremented upon destruction of
* this context through ::cuCtxDestroy().
* This context will cease to function if \p pD3DDevice is destroyed or encounters
* an error.
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
* operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
* is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D9_DEVICES_ALL from ::cuD3D9GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9GetDevices,
* ::cuGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuD3D9CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 9 device against which the current CUDA context was
* created
*
* Returns in \p *ppD3DDevice the Direct3D device against which this CUDA context
* was created in ::cuD3D9CtxCreate().
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D9GetDevice,
* ::cudaD3D9GetDirect3DDevice
*/
CUresult CUDAAPI cuD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3DDevice);
/**
* \brief Register a Direct3D 9 resource for access by CUDA
*
* Registers the Direct3D 9 resource \p pD3DResource for access by CUDA and
* returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
* - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
* - ::IDirect3DSurface9: may be accessed through an array.
* Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
* through an array.
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported formats is as follows:
* - D3DFMT_L8
* - D3DFMT_L16
* - D3DFMT_A8R8G8B8
* - D3DFMT_X8R8G8B8
* - D3DFMT_G16R16
* - D3DFMT_A8B8G8R8
* - D3DFMT_A8
* - D3DFMT_A8L8
* - D3DFMT_Q8W8V8U8
* - D3DFMT_V16U16
* - D3DFMT_A16B16G16R16F
* - D3DFMT_A16B16G16R16
* - D3DFMT_R32F
* - D3DFMT_G16R16F
* - D3DFMT_A32B32G32R32F
* - D3DFMT_G32R32F
* - D3DFMT_R16F
*
* If Direct3D interoperability is not initialized for this context using
* ::cuD3D9CtxCreate then ::CUDA_ERROR_INVALID_CONTEXT is returned.
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
* If \p Flags is not one of the values specified above then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D9RegisterResource(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
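/*
* Example (not part of the original header): a minimal sketch of registering
* a D3D9 vertex buffer and obtaining a linear device pointer through the
* graphics-interop API described above. "vb" and "stream" are assumed to be
* valid; error handling is reduced to early exits.
*/
static CUresult exampleMapD3D9VertexBuffer(IDirect3DVertexBuffer9 *vb, CUstream stream)
{
    CUgraphicsResource res = NULL;
    CUdeviceptr devPtr = 0;
    size_t size = 0;
    CUresult status;
    status = cuGraphicsD3D9RegisterResource(&res, (IDirect3DResource9 *)vb,
                                            CU_GRAPHICS_REGISTER_FLAGS_NONE);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuGraphicsMapResources(1, &res, stream);
    if (status == CUDA_SUCCESS) {
        /* Buffers map to a device pointer; surfaces and textures map to arrays. */
        status = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
        /* ... read or write [devPtr, devPtr + size) from kernels here ... */
        cuGraphicsUnmapResources(1, &res, stream);
    }
    cuGraphicsUnregisterResource(res);
    return status;
}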
/**
* \defgroup CUDA_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 9 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 9 interoperability functionality.
* @{
*/
/** Flags to register a resource */
typedef enum CUd3d9register_flags_enum {
CU_D3D9_REGISTER_FLAGS_NONE = 0x00,
CU_D3D9_REGISTER_FLAGS_ARRAY = 0x01,
} CUd3d9register_flags;
/** Flags to map or unmap a resource */
typedef enum CUd3d9map_flags_enum {
CU_D3D9_MAPRESOURCE_FLAGS_NONE = 0x00,
CU_D3D9_MAPRESOURCE_FLAGS_READONLY = 0x01,
CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
} CUd3d9map_flags;
/**
* \brief Register a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cuD3D9UnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cuD3D9UnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: Cannot be used with \p Flags set to
* ::CU_D3D9_REGISTER_FLAGS_ARRAY.
* - ::IDirect3DIndexBuffer9: Cannot be used with \p Flags set to
* ::CU_D3D9_REGISTER_FLAGS_ARRAY.
* - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and
* faces of cube maps may not be registered directly. To access individual
* surfaces associated with a texture, one must register the base texture
* object. For restrictions on the \p Flags parameter, see type
* ::IDirect3DBaseTexture9.
* - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
* associated with all mipmap levels of all faces of the texture will be
* accessible to CUDA.
*
* The \p Flags argument specifies the mechanism through which CUDA will access
* the Direct3D resource. The following values are allowed.
*
* - ::CU_D3D9_REGISTER_FLAGS_NONE: Specifies that CUDA will access this resource
* through a ::CUdeviceptr. The pointer, size, and (for textures) pitch for
* each subresource of this allocation may be queried through
* ::cuD3D9ResourceGetMappedPointer(), ::cuD3D9ResourceGetMappedSize(), and
* ::cuD3D9ResourceGetMappedPitch() respectively. This option is valid for
* all resource types.
* - ::CU_D3D9_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
* resource through a ::CUarray queried on a sub-resource basis through
* ::cuD3D9ResourceGetMappedArray(). This option is only valid for resources
* of type ::IDirect3DSurface9 and subtypes of ::IDirect3DBaseTexture9.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
* not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context, then
* ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
* type (e.g. is a non-stand-alone ::IDirect3DSurface9) or is already
* registered, then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource
* cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
*
* \param pResource - Resource to register for CUDA access
* \param Flags - Flags for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsD3D9RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int Flags);
/**
* \brief Unregister a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterResource(IDirect3DResource9 *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResource for access by CUDA.
*
* The resources in \p ppResource may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are mapped
* by CUDA. If an application does so the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D calls
* issued before ::cuD3D9MapResources() will complete before any CUDA kernels
* issued after ::cuD3D9MapResources() begin.
*
* If any of \p ppResource have not been registered for use with CUDA or if
* \p ppResource contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are
* presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
* returned.
*
* \param count - Number of resources in ppResource
* \param ppResource - Resources to map for CUDA usage
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapResources(unsigned int count, IDirect3DResource9 **ppResource);
/**
* \brief Unmaps Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResource.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cuD3D9UnmapResources() will complete before any Direct3D
* calls issued after ::cuD3D9UnmapResources() begin.
*
* If any of \p ppResource have not been registered for use with CUDA or if
* \p ppResource contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are not
* presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResource - Resources to unmap for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapResources(unsigned int count, IDirect3DResource9 **ppResource);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set \p Flags for mapping the Direct3D resource \p pResource.
*
* Changes to \p Flags will take effect the next time \p pResource is mapped.
* The \p Flags argument may be any of the following:
* - ::CU_D3D9_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_D3D9_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* \param pResource - Registered resource to set flags for
* \param Flags - Parameters for resource mapping
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int Flags);
/**
* \brief Get the dimensions of a registered surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p Face and \p Level.
*
* Because anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or
* ::IDirect3DSurface9 or if \p pResource has not been registered for use with
* CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p Face and \p Level may
* be accessed. The value set in \p pArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D9_REGISTER_FLAGS_ARRAY then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedArray(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get the pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pDevPtr the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p Face and \p Level.
* The value set in \p pDevPtr may change every time that \p pResource is
* mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D9_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* If \p pResource is of type ::IDirect3DCubeTexture9, then \p Face must be one
* of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types
* \p Face must be 0. If \p Face is invalid, then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* If \p pResource is of type ::IDirect3DBaseTexture9, then \p Level must
* correspond to a valid mipmap level. At present only mipmap level 0 is
* supported. For all other types \p Level must be 0. If \p Level is invalid,
* then ::CUDA_ERROR_INVALID_VALUE is returned.
*
* \param pDevPtr - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
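/*
* Example (not part of the original header): a sketch of how the deprecated
* CUDA 2.x-era calls above fit together for a vertex buffer. New code should
* use the ::cuGraphics* path documented earlier in this file; "vb" is assumed
* to be a live IDirect3DVertexBuffer9 and error handling is minimal.
*/
static CUresult exampleDeprecatedD3D9Flow(IDirect3DVertexBuffer9 *vb)
{
    IDirect3DResource9 *res = (IDirect3DResource9 *)vb;
    CUdeviceptr devPtr = 0;
    CUresult status;
    status = cuD3D9RegisterResource(res, CU_D3D9_REGISTER_FLAGS_NONE);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuD3D9MapResources(1, &res);
    if (status == CUDA_SUCCESS) {
        /* Face and Level are 0 for non-texture resources such as buffers. */
        status = cuD3D9ResourceGetMappedPointer(&devPtr, res, 0, 0);
        /* ... access "devPtr" from kernels here ... */
        cuD3D9UnmapResources(1, &res);
    }
    cuD3D9UnregisterResource(res);
    return status;
}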
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p Face and \p Level. The value
* set in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer.
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p Face and \p Level. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
* \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA,
* then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not
* registered with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped
* for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
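/*
* Example (not part of the original header): the offset arithmetic described
* above as a small helper. "bytesPerPixel" must match the format of the
* registered surface, and the pitch values are the ones returned by
* ::cuD3D9ResourceGetMappedPitch for the same Face and Level; pass z == 0
* (or slicePitch == 0) for 2D surfaces.
*/
static size_t exampleSampleByteOffset(size_t x, size_t y, size_t z,
                                      size_t pitch, size_t slicePitch,
                                      size_t bytesPerPixel)
{
    return z * slicePitch + y * pitch + bytesPerPixel * x;
}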
/* CUDA 1.x compatibility API. These functions are deprecated, please use the ones above. */
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9Begin(IDirect3DDevice9 *pDevice);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9End(void);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
/** @} */ /* END CUDA_D3D9_DEPRECATED */
/** @} */ /* END CUDA_D3D9 */
/**
* CUDA API versioning support
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D9CtxCreate
#undef cuD3D9ResourceGetSurfaceDimensions
#undef cuD3D9ResourceGetMappedPointer
#undef cuD3D9ResourceGetMappedSize
#undef cuD3D9ResourceGetMappedPitch
#undef cuD3D9MapVertexBuffer
CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedSize(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,131 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D9TYPEDEFS_H
#define CUDAD3D9TYPEDEFS_H
// Dependent includes for cudaD3D9.h
#include <d3d9.h>
#include <cudaD3D9.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D9.h
*/
#define PFN_cuD3D9GetDevice PFN_cuD3D9GetDevice_v2000
#define PFN_cuD3D9GetDevices PFN_cuD3D9GetDevices_v3020
#define PFN_cuD3D9CtxCreate PFN_cuD3D9CtxCreate_v3020
#define PFN_cuD3D9CtxCreateOnDevice PFN_cuD3D9CtxCreateOnDevice_v3020
#define PFN_cuD3D9GetDirect3DDevice PFN_cuD3D9GetDirect3DDevice_v2000
#define PFN_cuGraphicsD3D9RegisterResource PFN_cuGraphicsD3D9RegisterResource_v3000
#define PFN_cuD3D9RegisterResource PFN_cuD3D9RegisterResource_v2000
#define PFN_cuD3D9UnregisterResource PFN_cuD3D9UnregisterResource_v2000
#define PFN_cuD3D9MapResources PFN_cuD3D9MapResources_v2000
#define PFN_cuD3D9UnmapResources PFN_cuD3D9UnmapResources_v2000
#define PFN_cuD3D9ResourceSetMapFlags PFN_cuD3D9ResourceSetMapFlags_v2000
#define PFN_cuD3D9ResourceGetSurfaceDimensions PFN_cuD3D9ResourceGetSurfaceDimensions_v3020
#define PFN_cuD3D9ResourceGetMappedArray PFN_cuD3D9ResourceGetMappedArray_v2010
#define PFN_cuD3D9ResourceGetMappedPointer PFN_cuD3D9ResourceGetMappedPointer_v3020
#define PFN_cuD3D9ResourceGetMappedSize PFN_cuD3D9ResourceGetMappedSize_v3020
#define PFN_cuD3D9ResourceGetMappedPitch PFN_cuD3D9ResourceGetMappedPitch_v3020
#define PFN_cuD3D9Begin PFN_cuD3D9Begin_v2000
#define PFN_cuD3D9End PFN_cuD3D9End_v2000
#define PFN_cuD3D9RegisterVertexBuffer PFN_cuD3D9RegisterVertexBuffer_v2000
#define PFN_cuD3D9MapVertexBuffer PFN_cuD3D9MapVertexBuffer_v3020
#define PFN_cuD3D9UnmapVertexBuffer PFN_cuD3D9UnmapVertexBuffer_v2000
#define PFN_cuD3D9UnregisterVertexBuffer PFN_cuD3D9UnregisterVertexBuffer_v2000
/**
* Type definitions for functions defined in cudaD3D9.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevice_v2000)(CUdevice_v1 *pCudaDevice, const char *pszAdapterName);
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDirect3DDevice_v2000)(IDirect3DDevice9 **ppD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D9RegisterResource_v3000)(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterResource_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterResource_v2000)(IDirect3DResource9 *pResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceSetMapFlags_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedArray_v2010)(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v3020)(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9Begin_v2000)(IDirect3DDevice9 *pDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9End_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
/*
* Type definitions for older versioned functions in cudaD3D9.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v2000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v2000)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v2000)(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v2000)(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v2000)(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v2000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,610 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAGL_H
#define CUDAGL_H
#include <cuda.h>
#ifdef __APPLE__
#include <OpenGL/gl.h>
#else
#include <GL/gl.h>
#endif
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __CUDA_API_PER_THREAD_DEFAULT_STREAM
#define __CUDA_API_PTDS(api) api ## _ptds
#define __CUDA_API_PTSZ(api) api ## _ptsz
#else
#define __CUDA_API_PTDS(api) api
#define __CUDA_API_PTSZ(api) api
#endif
#define cuGLCtxCreate cuGLCtxCreate_v2
#define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2)
#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
#define cuGLGetDevices cuGLGetDevices_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file cudaGL.h
* \brief Header file for the OpenGL interoperability functions of the
* low-level CUDA driver application programming interface.
*/
/**
* \defgroup CUDA_GL OpenGL Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the OpenGL interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of OpenGL resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
#if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV;
#endif
#endif /* _WIN32 */
/**
* \brief Registers an OpenGL buffer object
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. A handle to the registered object is returned as \p
* pCudaResource. The register flags \p Flags specify the intended usage,
* as follows:
*
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
* will not write to this resource.
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
* \param pCudaResource - Pointer to the returned object handle
* \param buffer - name of buffer object to be registered
* \param Flags - Register flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsGLRegisterBuffer
*/
CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
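/*
* Example (not part of the original header): a minimal sketch of registering
* an OpenGL buffer object and filling it from CUDA. "vbo" is assumed to be a
* buffer created by the current GL context and "stream" a valid CUstream;
* WRITE_DISCARD is chosen here only because the sketch overwrites the buffer.
*/
static CUresult exampleWriteGLBuffer(GLuint vbo, CUstream stream)
{
    CUgraphicsResource res = NULL;
    CUdeviceptr devPtr = 0;
    size_t size = 0;
    CUresult status;
    status = cuGraphicsGLRegisterBuffer(&res, vbo,
                                        CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuGraphicsMapResources(1, &res, stream);
    if (status == CUDA_SUCCESS) {
        status = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
        /* ... launch a kernel that writes [devPtr, devPtr + size), then unmap
         * before OpenGL draws from the buffer ... */
        cuGraphicsUnmapResources(1, &res, stream);
    }
    cuGraphicsUnregisterResource(res);
    return status;
}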
/**
* \brief Register an OpenGL texture or renderbuffer object
*
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
* A handle to the registered object is returned as \p pCudaResource.
*
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
* or ::GL_RENDERBUFFER.
*
* The register flags \p Flags specify the intended usage, as follows:
*
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
* will not write to this resource.
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* The following image formats are supported. For brevity's sake, the list is abbreviated.
* For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
*
* The following image classes are currently disallowed:
* - Textures with borders
* - Multisampled renderbuffers
*
* \param pCudaResource - Pointer to the returned object handle
* \param image - name of texture or renderbuffer object to be registered
* \param target - Identifies the type of object specified by \p image
* \param Flags - Register flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsGLRegisterImage
*/
CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
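/*
 * Illustrative sketch (not part of the original header): registering a 2D GL
 * texture read-only and retrieving its backing CUDA array. `tex` is assumed to
 * be a valid, already-allocated GL_TEXTURE_2D; error checking is omitted.
 *
 *   CUgraphicsResource res;
 *   CUarray array;
 *   cuGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D,
 *                             CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY);
 *   cuGraphicsMapResources(1, &res, 0);
 *   cuGraphicsSubResourceGetMappedArray(&array, res, 0, 0);  // face 0, mip 0
 *   // ... bind `array` to a texture object and sample it from a kernel ...
 *   cuGraphicsUnmapResources(1, &res, 0);
 *   cuGraphicsUnregisterResource(res);
 */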
#ifdef _WIN32
/**
* \brief Gets the CUDA device associated with hGpu
*
* Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
* applicable.
*
* \param pDevice - Device associated with hGpu
* \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
* ::cuGLSetBufferObjectMapFlags,
* ::cudaWGLGetDevice
*/
CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
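/*
 * Illustrative sketch (not part of the original header, Windows only): mapping
 * a WGL_NV_gpu_affinity GPU handle to its CUDA device. Assumes the
 * wglEnumGpusNV entry point has already been obtained via wglGetProcAddress.
 *
 *   HGPUNV hGpu;
 *   if (wglEnumGpusNV(0, &hGpu)) {   // first affinity GPU
 *       CUdevice dev;
 *       cuWGLGetDevice(&dev, hGpu);
 *   }
 */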
#endif /* _WIN32 */
/**
* CUDA devices corresponding to an OpenGL device
*/
typedef enum CUGLDeviceList_enum {
CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
} CUGLDeviceList;
/**
* \brief Gets the CUDA devices associated with the current OpenGL context
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
* at most cudaDeviceCount of the CUDA-compatible devices corresponding to
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
* context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
*
* The \p deviceList argument may be any of the following:
* - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
* - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
* render the current frame (in SLI).
* - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
 * render the next frame (in SLI). Note that this is a prediction; it is not
 * guaranteed to be correct in all cases.
*
* \param pCudaDeviceCount - Returned number of CUDA devices.
* \param pCudaDevices - Returned CUDA devices.
* \param cudaDeviceCount - The size of the output device array pCudaDevices.
* \param deviceList - The set of devices to return.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
*
* \note This function is not supported on Mac OS X.
* \notefnerr
*
* \sa
* ::cuWGLGetDevice,
* ::cudaGLGetDevices
*/
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
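/*
 * Illustrative sketch (not part of the original header): enumerating the CUDA
 * devices behind the currently bound OpenGL context. A GL context must be
 * current on the calling thread; error checking is omitted.
 *
 *   unsigned int count = 0;
 *   CUdevice devices[8];
 *   cuGLGetDevices(&count, devices, 8, CU_GL_DEVICE_LIST_ALL);
 *   // devices[0..count-1] now hold the CUDA devices used by the GL context
 */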
/**
* \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
* CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated OpenGL interoperability functionality.
*
* @{
*/
/** Flags to map or unmap a resource */
typedef enum CUGLmap_flags_enum {
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} CUGLmap_flags;
/**
* \brief Create a CUDA context for interoperability with OpenGL
*
* \deprecated This function is deprecated as of Cuda 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with an OpenGL
* context in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned CUDA context
* \param Flags - Options for CUDA context creation
* \param device - Device on which to create the context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY
* \notefnerr
*
* \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
* ::cuWGLGetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
/**
* \brief Initializes OpenGL interoperability
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Initializes OpenGL interoperability. This function is deprecated
* and calling it is no longer required. It may fail if the needed
* OpenGL driver facilities are not available.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
* ::cuWGLGetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
/**
* \brief Registers an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. This function must be called before CUDA can map the buffer
* object. There must be a valid OpenGL context bound to the current
* thread when this function is called, and the buffer name is
* resolved by that context.
*
* \param buffer - The name of the buffer object to register.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsGLRegisterBuffer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
/**
* \brief Maps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Maps the buffer object specified by \p buffer into the address space of the
* current CUDA context and returns in \p *dptr and \p *size the base pointer
* and size of the resulting mapping.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* All streams in the current CUDA context are synchronized with the
* current GL context.
*
* \param dptr - Returned mapped base pointer
* \param size - Returned size of mapping
* \param buffer - The name of the buffer object to map
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_MAP_FAILED
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer);
/**
* \brief Unmaps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unmaps the buffer object specified by \p buffer for access by CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* All streams in the current CUDA context are synchronized with the
* current GL context.
*
* \param buffer - Buffer object to unmap
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
/**
* \brief Unregister an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unregisters the buffer object specified by \p buffer. This
* releases any resources associated with the registered buffer.
* After this call, the buffer may no longer be mapped for access by
* CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* \param buffer - Name of the buffer object to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
/**
* \brief Set the map flags for an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Sets the map flags for the buffer object specified by \p buffer.
*
* Changes to \p Flags will take effect the next time \p buffer is mapped.
* The \p Flags argument may be any of the following:
* - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p buffer has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
 * \param buffer - Buffer object whose map flags are being set
* \param Flags - Map flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
/**
* \brief Maps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Maps the buffer object specified by \p buffer into the address space of the
* current CUDA context and returns in \p *dptr and \p *size the base pointer
* and size of the resulting mapping.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* Stream \p hStream in the current CUDA context is synchronized with
* the current GL context.
*
* \param dptr - Returned mapped base pointer
* \param size - Returned size of mapping
* \param buffer - The name of the buffer object to map
* \param hStream - Stream to synchronize
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_MAP_FAILED
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
/**
* \brief Unmaps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unmaps the buffer object specified by \p buffer for access by CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* Stream \p hStream in the current CUDA context is synchronized with
* the current GL context.
*
* \param buffer - Name of the buffer object to unmap
* \param hStream - Stream to synchronize
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
/** @} */ /* END CUDA_GL_DEPRECATED */
/** @} */ /* END CUDA_GL */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuGLCtxCreate
#undef cuGLMapBufferObject
#undef cuGLMapBufferObjectAsync
#undef cuGLGetDevices
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer);
CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,127 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAGLTYPEDEFS_H
#define CUDAGLTYPEDEFS_H
// Dependent includes for cudagl.h
#ifdef __APPLE__
#include <OpenGL/gl.h>
#else
#include <GL/gl.h>
#endif
#include <cudaGL.h>
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
#else
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
#endif
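/*
 * Note (not part of the original header): with CUDA_API_PER_THREAD_DEFAULT_STREAM
 * defined, __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000) expands to
 * PFN_cuGLMapBufferObject_v7000_ptds; without it, the same macro expands to
 * PFN_cuGLMapBufferObject_v3020, i.e. the legacy default-stream entry point.
 */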
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaGL.h
*/
#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
#define PFN_cuGLInit PFN_cuGLInit_v2000
#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
/**
* Type definitions for functions defined in cudaGL.h
*/
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
#ifdef _WIN32
typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
#endif
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
/*
* Type definitions for older versioned functions in cuda.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,217 +0,0 @@
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef cuda_profiler_H
#define cuda_profiler_H
#include <cuda.h>
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
* Profiler Output Modes
*/
/*DEVICE_BUILTIN*/
typedef enum CUoutput_mode_enum
{
CU_OUT_KEY_VALUE_PAIR = 0x00, /**< Output mode Key-Value pair format. */
CU_OUT_CSV = 0x01 /**< Output mode Comma separated values format. */
}CUoutput_mode;
/**
* \ingroup CUDA_DRIVER
* \defgroup CUDA_PROFILER_DEPRECATED Profiler Control [DEPRECATED]
*
* ___MANBRIEF___ profiler control functions of the low-level CUDA driver API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the low-level CUDA
* driver application programming interface.
*
* @{
*/
/**
* \brief Initialize the profiling.
*
* \deprecated
*
 * Using this API, the user can initialize the CUDA profiler by specifying
 * the configuration file, output file and output file format. This
 * API is generally used to profile different sets of counters by
 * looping over the kernel launch. The \p configFile parameter can be used
* to select profiling options including profiler counters. Refer to
* the "Compute Command Line Profiler User Guide" for supported
* profiler options and counters.
*
* Limitation: The CUDA profiler cannot be initialized with this API
* if another profiling tool is already active, as indicated by the
* ::CUDA_ERROR_PROFILER_DISABLED return code.
*
* Typical usage of the profiling APIs is as follows:
*
* for each set of counters/options\n
* {\n
* cuProfilerInitialize(); //Initialize profiling, set the counters or options in the config file \n
* ...\n
* cuProfilerStart(); \n
* // code to be profiled \n
* cuProfilerStop(); \n
* ...\n
* cuProfilerStart(); \n
* // code to be profiled \n
* cuProfilerStop(); \n
* ...\n
* }\n
*
* \param configFile - Name of the config file that lists the counters/options
* for profiling.
* \param outputFile - Name of the outputFile where the profiling results will
* be stored.
* \param outputMode - outputMode, can be ::CU_OUT_KEY_VALUE_PAIR or ::CU_OUT_CSV.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_PROFILER_DISABLED
* \notefnerr
*
* \sa
* ::cuProfilerStart,
* ::cuProfilerStop,
* ::cudaProfilerInitialize
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuProfilerInitialize(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
/** @} */ /* END CUDA_PROFILER_DEPRECATED */
/**
* \ingroup CUDA_DRIVER
* \defgroup CUDA_PROFILER Profiler Control
*
* ___MANBRIEF___ profiler control functions of the low-level CUDA driver API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the low-level CUDA
* driver application programming interface.
*
* @{
*/
/**
* \brief Enable profiling.
*
* Enables profile collection by the active profiling tool for the
* current context. If profiling is already enabled, then
* cuProfilerStart() has no effect.
*
* cuProfilerStart and cuProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuProfilerInitialize,
* ::cuProfilerStop,
* ::cudaProfilerStart
*/
CUresult CUDAAPI cuProfilerStart(void);
/**
* \brief Disable profiling.
*
* Disables profile collection by the active profiling tool for the
* current context. If profiling is already disabled, then
* cuProfilerStop() has no effect.
*
* cuProfilerStart and cuProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuProfilerInitialize,
* ::cuProfilerStart,
* ::cudaProfilerStop
*/
CUresult CUDAAPI cuProfilerStop(void);
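/*
 * Illustrative sketch (not part of the original header): bracketing a region of
 * interest so that the active profiling tool records only that region. The
 * launch_kernels() helper is a placeholder for the caller's own work.
 *
 *   cuProfilerStart();      // begin collection for the current context
 *   launch_kernels();       // only this region is profiled
 *   cuCtxSynchronize();
 *   cuProfilerStop();       // end collection
 */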
/** @} */ /* END CUDA_PROFILER */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,78 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAPROFILERTYPEDEFS_H
#define CUDAPROFILERTYPEDEFS_H
#include <cudaProfiler.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaProfiler.h
*/
#define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
#define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
#define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
/**
* Type definitions for functions defined in cudaProfiler.h
*/
typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,939 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDATYPEDEFS_H
#define CUDATYPEDEFS_H
#include <cuda.h>
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
#else
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
#endif
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cuda.h
*/
#define PFN_cuGetErrorString PFN_cuGetErrorString_v6000
#define PFN_cuGetErrorName PFN_cuGetErrorName_v6000
#define PFN_cuInit PFN_cuInit_v2000
#define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020
#define PFN_cuDeviceGet PFN_cuDeviceGet_v2000
#define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000
#define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000
#define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040
#define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000
#define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020
#define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
#define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000
#define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020
#define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020
#define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020
#define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020
#define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000
#define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000
#define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000
#define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000
#define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000
#define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000
#define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000
#define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040
#define PFN_cuCtxCreate PFN_cuCtxCreate_v11040
#define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000
#define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000
#define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000
#define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000
#define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000
#define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000
#define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000
#define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000
#define PFN_cuCtxSetLimit PFN_cuCtxSetLimit_v3010
#define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010
#define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020
#define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020
#define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020
#define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020
#define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020
#define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050
#define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000
#define PFN_cuCtxAttach PFN_cuCtxAttach_v2000
#define PFN_cuCtxDetach PFN_cuCtxDetach_v2000
#define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040
#define PFN_cuModuleLoad PFN_cuModuleLoad_v2000
#define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000
#define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010
#define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000
#define PFN_cuModuleUnload PFN_cuModuleUnload_v2000
#define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000
#define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020
#define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000
#define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000
#define PFN_cuLinkCreate PFN_cuLinkCreate_v6050
#define PFN_cuLinkAddData PFN_cuLinkAddData_v6050
#define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050
#define PFN_cuLinkComplete PFN_cuLinkComplete_v5050
#define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050
#define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020
#define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
#define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020
#define PFN_cuMemFree PFN_cuMemFree_v3020
#define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020
#define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020
#define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000
#define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020
#define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020
#define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030
#define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000
#define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010
#define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010
#define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010
#define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010
#define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010
#define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000
#define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010
#define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050
#define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000
#define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
#define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
#define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
#define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
#define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
#define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
#define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
#define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
#define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
#define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
#define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
#define PFN_cuMemcpy2DUnaligned __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
#define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
#define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
#define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
#define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
#define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
#define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
#define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
#define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
#define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
#define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
#define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
#define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
#define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
#define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
#define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
#define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
#define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
#define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
#define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
#define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
#define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
#define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
#define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
#define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
#define PFN_cuArrayCreate PFN_cuArrayCreate_v3020
#define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020
#define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010
#define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010
#define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060
#define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060
#define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020
#define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000
#define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020
#define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020
#define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000
#define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000
#define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000
#define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020
#define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020
#define PFN_cuMemCreate PFN_cuMemCreate_v10020
#define PFN_cuMemRelease PFN_cuMemRelease_v10020
#define PFN_cuMemMap PFN_cuMemMap_v10020
#define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
#define PFN_cuMemUnmap PFN_cuMemUnmap_v10020
#define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020
#define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020
#define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020
#define PFN_cuMemImportFromShareableHandle PFN_cuMemImportFromShareableHandle_v10020
#define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020
#define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020
#define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000
#define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
#define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
#define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020
#define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020
#define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020
#define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020
#define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020
#define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020
#define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020
#define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
#define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020
#define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020
#define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020
#define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020
#define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000
#define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
#define PFN_cuMemAdvise PFN_cuMemAdvise_v8000
#define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000
#define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000
#define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000
#define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000
#define PFN_cuStreamCreate PFN_cuStreamCreate_v2000
#define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050
#define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
#define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
#define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
#define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
#define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
#define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
#define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010
#define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
#define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
#define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
#define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
#define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
#define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
#define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
#define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
#define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000
#define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
#define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
#define PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
#define PFN_cuEventCreate PFN_cuEventCreate_v2000
#define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
#define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
#define PFN_cuEventQuery PFN_cuEventQuery_v2000
#define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000
#define PFN_cuEventDestroy PFN_cuEventDestroy_v4000
#define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000
#define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000
#define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000
#define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
#define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000
#define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000
#define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
#define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
#define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000
#define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
#define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
#define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
#define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
#define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
#define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020
#define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000
#define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000
#define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020
#define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
#define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
#define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000
#define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
#define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000
#define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000
#define PFN_cuParamSetSize PFN_cuParamSetSize_v2000
#define PFN_cuParamSeti PFN_cuParamSeti_v2000
#define PFN_cuParamSetf PFN_cuParamSetf_v2000
#define PFN_cuParamSetv PFN_cuParamSetv_v2000
#define PFN_cuLaunch PFN_cuLaunch_v2000
#define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000
#define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000
#define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000
#define PFN_cuGraphCreate PFN_cuGraphCreate_v10000
#define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000
#define PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000
#define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000
#define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000
#define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000
#define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000
#define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000
#define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000
#define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000
#define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000
#define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000
#define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000
#define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000
#define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000
#define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000
#define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010
#define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010
#define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010
#define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010
#define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010
#define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010
#define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
#define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
#define PFN_cuGraphClone PFN_cuGraphClone_v10000
#define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000
#define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000
#define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000
#define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000
#define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000
#define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000
#define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000
#define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000
#define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000
#define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000
#define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000
#define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040
#define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010
#define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020
#define PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020
#define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020
#define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010
#define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010
#define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010
#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
#define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
#define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
#define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000
#define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000
#define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020
#define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000
#define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000
#define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000
#define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030
#define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040
#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
#define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040
#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
#define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040
#define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040
#define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
#define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050
#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
#define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
#define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000
#define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000
#define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020
#define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010
#define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000
#define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000
#define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000
#define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000
#define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000
#define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000
#define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000
#define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000
#define PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000
#define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020
#define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000
#define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000
#define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000
#define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000
#define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000
#define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000
#define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000
#define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000
#define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000
#define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000
#define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000
#define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000
#define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000
#define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000
#define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000
#define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000
#define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000
#define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000
#define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000
#define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000
#define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000
#define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000
#define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000
#define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000
#define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000
#define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000
#define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000
#define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000
#define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000
#define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
#define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020
#define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050
#define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
#define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
#define PFN_cuGetExportTable PFN_cuGetExportTable_v3000
#define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000
#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
#define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030
#define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030
#define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030
#define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030
#define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030
#define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030
/*
* Type definitions for functions defined in cuda.h
*/
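/*
 * Illustrative usage sketch (not a normative part of this header): each PFN_*
 * typedef below binds a driver symbol to a specific ABI version, so an entry
 * point can be resolved at run time with cuGetProcAddress and then invoked
 * through the matching typedef. The version value and flags in this sketch
 * are example assumptions (11030 = CUDA 11.3, 0 = default flags):
 *
 *   PFN_cuMemAlloc_v3020 pfnMemAlloc = NULL;
 *   if (cuGetProcAddress("cuMemAlloc", (void **)&pfnMemAlloc, 11030, 0) == CUDA_SUCCESS
 *       && pfnMemAlloc != NULL) {
 *       CUdeviceptr_v2 dptr;
 *       pfnMemAlloc(&dptr, 1024);  // dispatches to the versioned driver entry point
 *   }
 */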
typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
/*
* Type definitions for older versioned functions in cuda.h
*/
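/*
 * Note (descriptive, based on the declarations that follow): the guarded block
 * below lists earlier ABI revisions of entry points that were later
 * re-versioned; for example, the *_v2000 memory functions still take the
 * legacy CUdeviceptr_v1 and unsigned int sizes rather than CUdeviceptr_v2 and
 * size_t. They are exposed only when __CUDA_API_VERSION_INTERNAL is defined.
 */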
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard
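
The PFN_* typedefs above exist so that an application can bind versioned driver entry points at run time instead of linking against a particular libcuda. A minimal sketch of that pattern follows; it assumes a CUDA 11.3+ toolkit where cudaTypedefs.h ships these typedefs, uses POSIX dlopen, and trims most error handling.

```cuda
// Illustrative only: open the driver library and bind one entry point through
// its PFN_* typedef. The _v11020 suffix records the CUDA version that
// introduced this signature; the exported symbol name carries no suffix.
#include <cudaTypedefs.h>   // PFN_* typedefs (assumed toolkit header)
#include <dlfcn.h>
#include <cstdio>

int main()
{
    void* libcuda = dlopen("libcuda.so.1", RTLD_NOW);
    if (!libcuda) { std::fprintf(stderr, "CUDA driver not found\n"); return 1; }

    auto pfnMemAllocAsync =
        reinterpret_cast<PFN_cuMemAllocAsync_v11020>(dlsym(libcuda, "cuMemAllocAsync"));
    std::printf("cuMemAllocAsync %s\n", pfnMemAllocAsync ? "resolved" : "missing");

    dlclose(libcuda);
    return 0;
}
```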

View file

@@ -1,211 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_H_
# define _CUDA_AWBARRIER_H_
# include "cuda_awbarrier_primitives.h"
# if !defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
# error This file requires compute capability 7.0 or greater.
# endif
# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
-std=c++11 compiler option.
# endif
_CUDA_AWBARRIER_BEGIN_NAMESPACE
class awbarrier {
public:
class arrival_token {
public:
arrival_token() = default;
~arrival_token() = default;
_CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
private:
_CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
uint64_t token;
friend awbarrier;
};
awbarrier() = default;
awbarrier(const awbarrier&) = delete;
awbarrier& operator=(const awbarrier&) = delete;
~awbarrier() = default;
_CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
_CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
_CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
_CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
_CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
private:
uint64_t barrier;
friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
friend class pipeline;
};
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier::arrival_token::pending_count() const
{
const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
return (pending_count >> 15);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token::arrival_token(uint64_t token)
: token(token)
{
}
_CUDA_AWBARRIER_QUALIFIER
void init(awbarrier* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
const uint32_t init_count = (expected_count << 15) + expected_count;
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
}
_CUDA_AWBARRIER_QUALIFIER
void inval(awbarrier* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
const uint32_t arrive_count = 1 << 15;
const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
(void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
return arrival_token(token);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive_and_drop()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
const uint32_t arrive_count = 1 << 15;
const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
(void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
return arrival_token(token);
}
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
{
constexpr uint64_t max_busy_wait_cycles = 1024;
constexpr uint32_t max_sleep_ns = 1 << 20;
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
return true;
}
uint64_t start_cycles = clock64();
uint64_t elapsed_cycles = 0;
uint32_t sleep_ns = 32;
while (elapsed_cycles < hint_cycles) {
if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
return true;
}
if (elapsed_cycles > max_busy_wait_cycles) {
__nanosleep(sleep_ns);
if (sleep_ns < max_sleep_ns) {
sleep_ns *= 2;
}
}
elapsed_cycles = clock64() - start_cycles;
}
return false;
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::wait(arrival_token token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
while (!timed_wait(token, ~0u));
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::arrive_and_wait()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
this->wait(this->arrive());
}
_CUDA_AWBARRIER_QUALIFIER __host__
constexpr uint32_t awbarrier::max()
{
return _CUDA_AWBARRIER_MAX_COUNT;
}
_CUDA_AWBARRIER_END_NAMESPACE
#endif /* !_CUDA_AWBARRIER_H_ */
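
The awbarrier class above is only valid in shared memory and must be initialized by exactly one thread before any thread arrives on it. A minimal device-side sketch, assuming compute capability 7.0+ and one arrival per thread in the block (the kernel name is illustrative):

```cuda
// Illustrative only: one block-wide barrier, initialized once, used as a
// rendezvous point between two phases of work.
#include <cuda_awbarrier.h>

__global__ void block_barrier_demo()
{
    __shared__ nvcuda::experimental::awbarrier bar;    // must live in shared memory

    if (threadIdx.x == 0) {
        nvcuda::experimental::init(&bar, blockDim.x);  // expected_count = threads in block
    }
    __syncthreads();                                   // make the init visible to all threads

    // ... phase 1 work ...

    bar.arrive_and_wait();                             // every thread arrives, then waits

    // ... phase 2 work ...
}
```

Calling arrive() and later wait(token) splits the same rendezvous into two steps, which lets a thread do independent work between signalling its arrival and blocking on completion.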

View file

@@ -1,370 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_HELPERS_H_
# define _CUDA_AWBARRIER_HELPERS_H_
# define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
# define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
# define _CUDA_AWBARRIER_END_NAMESPACE } }
# define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
# define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
# define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE
# if !defined(_CUDA_AWBARRIER_QUALIFIER)
# define _CUDA_AWBARRIER_QUALIFIER inline __device__
# endif
# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
# endif
# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_AWBARRIER_ARCH_700_OR_LATER
# endif
# define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
# if (__CUDA_ARCH__ >= 800)
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 1
# else
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 0
# endif
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
# endif
# if !defined(_CUDA_AWBARRIER_DEBUG)
# if defined(__CUDACC_DEBUG__)
# define _CUDA_AWBARRIER_DEBUG 1
# else
# define _CUDA_AWBARRIER_DEBUG 0
# endif
# endif
# if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
# include <cassert>
# endif
# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
# define _CUDA_AWBARRIER_ABORT() assert(0);
# else
# define _CUDA_AWBARRIER_ASSERT(x)
# define _CUDA_AWBARRIER_ABORT() __trap();
# endif
# if defined(_MSC_VER) && !defined(_WIN64)
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "r"
# else
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "l"
# endif
# if defined(__CUDACC_RTC__)
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
# else
# include <stdint.h>
# endif
# if defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
template<bool UseHWAtomicArrive>
struct ImplementationChooser;
template<>
struct ImplementationChooser<true> {
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
:
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
: "memory");
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
asm volatile ("mbarrier.inval.shared.b64 [%0];"
:
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
uint32_t pending_count;
asm ("mbarrier.pending_count.b64 %0, %1;"
: "=r"(pending_count)
: "l"(token));
return pending_count;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
uint64_t token;
if (Drop) {
asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
} else {
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
}
return token;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
uint64_t token;
if (Drop) {
asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
: "memory");
} else {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
: "memory");
}
return token;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
uint16_t wait_complete;
asm volatile ("{"
" .reg .pred %%p;"
" mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
" selp.u16 %0, 1, 0, %%p;"
"}"
: "=h"(wait_complete)
: "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
: "memory");
return bool(wait_complete);
}
};
template<>
struct ImplementationChooser<false> {
union AWBarrier {
struct {
uint32_t expected;
uint32_t pending;
} split;
uint64_t raw;
};
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
awbarrier->split.expected = 0x40000000 - expected_count;
awbarrier->split.pending = 0x80000000 - expected_count;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
const uint32_t pending = token >> 32;
return 0x80000000 - (pending & 0x7fffffff);
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
if (Drop) {
(void)atomicAdd_block(&awbarrier->split.expected, 1);
}
__threadfence_block();
const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
const uint32_t new_pending = old_pending + 1;
const bool reset = (old_pending ^ new_pending) & 0x80000000;
if (reset) {
__threadfence_block();
uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
new_expected &= ~0x40000000;
if (new_expected & 0x20000000) {
new_expected |= 0x40000000;
}
atomicAdd_block(&awbarrier->split.pending, new_expected);
}
return static_cast<uint64_t>(old_pending) << 32;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
if (Drop) {
(void)atomicAdd_block(&awbarrier->split.expected, count);
}
return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
}
};
_CUDA_AWBARRIER_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_init(barrier, expected_count);
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_inval(barrier);
}
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_token_pending_count(token);
}
template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop_no_complete<Drop>(barrier, arrive_count);
}
template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop<Drop>(barrier);
}
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_test_wait(barrier, token);
}
_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
# endif /* !_CUDA_AWBARRIER_ARCH_700_OR_LATER */
#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
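
In the pre-Ampere software fallback above, the 64-bit barrier word packs an expected counter and a pending counter, and an arrival token simply carries the pre-arrival pending value in its upper 32 bits. A small host-side sketch of the same bias arithmetic (plain C++, no GPU required; illustrative only) makes the constants easier to follow:

```cuda
#include <cstdint>
#include <cassert>

// Mirror of ImplementationChooser<false>::awbarrier_token_pending_count():
// at init, pending is biased to 0x80000000 - expected_count, so the number of
// outstanding arrivals can be recovered from the token's upper 32 bits.
static uint32_t fallback_pending_count(uint64_t token)
{
    const uint32_t pending = static_cast<uint32_t>(token >> 32);
    return 0x80000000u - (pending & 0x7fffffffu);
}

int main()
{
    const uint32_t expected        = 4;                 // expected_count given to init
    const uint32_t pending_at_init = 0x80000000u - expected;
    const uint64_t token           = static_cast<uint64_t>(pending_at_init) << 32;
    assert(fallback_pending_count(token) == expected);  // all 4 arrivals outstanding
    return 0;
}
```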

View file

@@ -1,115 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
# define _CUDA_AWBARRIER_PRIMITIVES_H_
# include "cuda_awbarrier_helpers.h"
# if !defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
# error This file requires compute capability 7.0 or greater.
# endif
typedef uint64_t __mbarrier_t;
typedef uint64_t __mbarrier_token_t;
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
uint32_t __mbarrier_maximum_count()
{
return _CUDA_AWBARRIER_MAX_COUNT;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_inval(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token)
{
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
}
#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
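
The C-style primitives above are the thin layer that the awbarrier class wraps. A minimal kernel sketch using them directly (compute capability 7.0+, one expected arrival per thread; the spin loop is the simplest possible wait, and a real kernel would interleave independent work):

```cuda
// Illustrative only: init, arrive, poll, and invalidate a block-wide mbarrier.
#include <cuda_awbarrier_primitives.h>

__global__ void mbarrier_demo()
{
    __shared__ __mbarrier_t bar;

    if (threadIdx.x == 0) {
        __mbarrier_init(&bar, blockDim.x);                  // one expected arrival per thread
    }
    __syncthreads();

    __mbarrier_token_t token = __mbarrier_arrive(&bar);     // signal this thread's arrival
    while (!__mbarrier_test_wait(&bar, token)) {
        // spin until the phase completes; independent work could go here
    }

    __syncthreads();                                        // nobody may still be polling
    if (threadIdx.x == 0) {
        __mbarrier_inval(&bar);                             // invalidate before the memory is reused
    }
}
```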

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,724 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D10_INTEROP_H__)
#define __CUDA_D3D10_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d10_1.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D10 Direct3D 10 Interoperability
* This section describes the Direct3D 10 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 10
 * resources is performed with the graphics API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D10 device
*/
enum cudaD3D10DeviceList
{
cudaD3D10DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D10 device */
cudaD3D10DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
cudaD3D10DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */
};
/**
* \brief Registers a Direct3D 10 resource for access by CUDA
*
* Registers the Direct3D 10 resource \p pD3DResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
* This call potentially has a high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::ID3D10Buffer: may be accessed via a device pointer
* - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D10RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D10RegisterResource(struct cudaGraphicsResource **resource, ID3D10Resource *pD3DResource, unsigned int flags);
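Registration is the one-time setup step; the per-frame work goes through the generic graphics-interop calls listed in the \sa block above. A hedged host-side sketch (the buffer, stream, and error handling are placeholders):

```cuda
// Illustrative only: register an ID3D10Buffer once, then map it each frame to
// obtain a device pointer that CUDA kernels can read and write.
#include <cuda_d3d10_interop.h>

void register_and_map(ID3D10Buffer* pBuf, cudaStream_t stream)
{
    cudaGraphicsResource* res = nullptr;
    cudaGraphicsD3D10RegisterResource(&res, pBuf, cudaGraphicsRegisterFlagsNone);

    // Per frame: map, fetch the pointer, launch work, unmap.
    cudaGraphicsMapResources(1, &res, stream);

    void*  dptr  = nullptr;
    size_t bytes = 0;
    cudaGraphicsResourceGetMappedPointer(&dptr, &bytes, res);
    // ... launch kernels on 'stream' that use dptr ...

    cudaGraphicsUnmapResources(1, &res, stream);

    // At teardown:
    cudaGraphicsUnregisterResource(res);
}
```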
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
* will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
*
* \param device - Returns the device corresponding to pAdapter
* \param pAdapter - D3D10 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsD3D10RegisterResource,
* ::cuD3D10GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevice(int *device, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 10 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 10 device \p pD3D10Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 10 device \p pD3D10Device.
*
 * If any of the GPUs being used to render \p pD3D10Device are not CUDA-capable then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D10Device - Direct3D 10 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D10DeviceListAll for all devices,
* ::cudaD3D10DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D10DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D10GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, enum cudaD3D10DeviceList deviceList);
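Used together with the enum above, this call is how an application picks which CUDA device should talk to a given D3D10 device. A short sketch, assuming the first returned ordinal is acceptable:

```cuda
// Illustrative only: list the CUDA devices that can interoperate with a D3D10
// device and return the first one, or -1 if none qualifies.
#include <cuda_d3d10_interop.h>

int pick_cuda_device_for(ID3D10Device* pD3D10Device)
{
    int devices[8];
    unsigned int count = 0;
    if (cudaD3D10GetDevices(&count, devices, 8,
                            pD3D10Device, cudaD3D10DeviceListAll) != cudaSuccess
        || count == 0) {
        return -1;
    }
    return devices[0];
}
```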
/** @} */ /* END CUDART_D3D10 */
/**
* \addtogroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 10 interoperability functions.
*
* @{
*/
/**
* CUDA D3D10 Register Flags
*/
enum cudaD3D10RegisterFlags
{
cudaD3D10RegisterFlagsNone = 0, /**< Default; Resource can be accessed through a void* */
cudaD3D10RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */
};
/**
* CUDA D3D10 Map Flags
*/
enum cudaD3D10MapFlags
{
cudaD3D10MapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaD3D10MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaD3D10MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param ppD3D10Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D10SetDirect3DDevice
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10GetDirect3DDevice(ID3D10Device **ppD3D10Device);
/**
* \brief Sets the Direct3D 10 device to use for interoperability with
* a CUDA device
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pD3D10Device - Direct3D device to use for interoperability
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D10DeviceListAll from ::cudaD3D10GetDevices,
* may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D10GetDevice,
* ::cudaGraphicsD3D10RegisterResource,
* ::cudaDeviceReset
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10SetDirect3DDevice(ID3D10Device *pD3D10Device, int device __dv(-1));
/**
* \brief Registers a Direct3D 10 resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaD3D10UnregisterResource(). Also on success, this call will increase
* the internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaD3D10UnregisterResource().
*
* This call potentially has a high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following:
*
* - ::ID3D10Buffer: Cannot be used with \p flags set to
* \p cudaD3D10RegisterFlagsArray.
* - ::ID3D10Texture1D: No restrictions.
* - ::ID3D10Texture2D: No restrictions.
* - ::ID3D10Texture3D: No restrictions.
*
* The \p flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following values are allowed.
*
* - ::cudaD3D10RegisterFlagsNone: Specifies that CUDA will access this
* resource through a \p void*. The pointer, size, and pitch for each
* subresource of this resource may be queried through
* ::cudaD3D10ResourceGetMappedPointer(), ::cudaD3D10ResourceGetMappedSize(),
* and ::cudaD3D10ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
* - ::cudaD3D10RegisterFlagsArray: Specifies that CUDA will access this
* resource through a \p CUarray queried on a sub-resource basis through
* ::cudaD3D10ResourceGetMappedArray(). This option is only valid for resources
* of type ::ID3D10Texture1D, ::ID3D10Texture2D, and ::ID3D10Texture3D.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context then
* ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
* or is already registered then ::cudaErrorInvalidResourceHandle is returned.
* If \p pResource cannot be registered then ::cudaErrorUnknown is returned.
*
* \param pResource - Resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsD3D10RegisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10RegisterResource(ID3D10Resource *pResource, unsigned int flags);
/**
* \brief Unregisters a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p resource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle
* is returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnregisterResource(ID3D10Resource *pResource);
/**
* \brief Maps Direct3D Resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are
* mapped by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D
* calls issued before ::cudaD3D10MapResources() will complete before any CUDA
* kernels issued after ::cudaD3D10MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
* is returned. If any of \p ppResources are presently mapped for access by
* CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10MapResources(int count, ID3D10Resource **ppResources);
/**
* \brief Unmaps Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resource in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cudaD3D10UnmapResources() will complete before any Direct3D
* calls issued after ::cudaD3D10UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnmapResources(int count, ID3D10Resource **ppResources);
/**
* \brief Gets an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *ppArray an array through which the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p subResource may be
* accessed. The value set in \p ppArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D10RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter, see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param ppArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedArray(cudaArray **ppArray, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set usage flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaD3D10MapFlagsNone: Specifies no hints about how this resource will
* be used. It is therefore assumed that this resource will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaD3D10MapFlagsReadOnly: Specifies that CUDA kernels which access
* this resource will not write to this resource.
* - ::cudaD3D10MapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this resource will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously stored in
* the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidHandle is returned. If \p pResource is presently mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param pResource - Registered resource to set flags for
* \param flags - Parameters for resource mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown,
* \notefnerr
*
* \sa ::cudaGraphicsResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int flags);
/**
* \brief Gets the dimensions of a registered Direct3D surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource which corresponds
* to \p subResource.
*
* Since anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
* ::ID3D10Texture3D, or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidHandle is returned.
* For usage requirements of \p subResource parameters see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPointer the base pointer of the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p subResource. The
* value set in \p pPointer may change every time that \p pResource is mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
 * ::cudaD3D10RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
*
* If \p pResource is of type ::ID3D10Buffer then \p subResource must be 0.
* If \p pResource is of any other type, then the value of \p subResource must
* come from the subresource calculation in ::D3D10CalcSubResource().
*
* \param pPointer - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPointer(void **pPointer, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource which corresponds to \p subResource. The value set in
* \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidHandle is returned. If \p pResource was not registered
* with usage flags ::cudaD3D10RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped for
* access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p subResource. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
 * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
* ::ID3D10Texture3D, or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was
* not registered with usage flags ::cudaD3D10RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int subResource);
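The two formulas above reduce to one address computation. A tiny helper, illustrative only, that mirrors them exactly (pass z = 0 and slicePitch = 0 for a 2D subresource):

```cuda
#include <cstddef>
#include <cstdint>

// Byte address of texel (x, y, z) in a mapped subresource, as documented above:
// z * slicePitch + y * pitch + bytesPerPixel * x.
inline void* texel_address(void* base, size_t pitch, size_t slicePitch,
                           size_t bytesPerPixel, size_t x, size_t y, size_t z)
{
    return static_cast<uint8_t*>(base) + z * slicePitch + y * pitch + bytesPerPixel * x;
}
```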
/** @} */ /* END CUDART_D3D10_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D10_INTEROP_H__ */

View file

@@ -1,323 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D11_INTEROP_H__)
#define __CUDA_D3D11_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d11.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D11 Direct3D 11 Interoperability
* This section describes the Direct3D 11 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 11
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D11 device
*/
enum cudaD3D11DeviceList
{
cudaD3D11DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D11 device */
cudaD3D11DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
cudaD3D11DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */
};
/**
* \brief Register a Direct3D 11 resource for access by CUDA
*
* Registers the Direct3D 11 resource \p pD3DResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::ID3D11Buffer: may be accessed via a device pointer
* - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D11RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D11RegisterResource(struct cudaGraphicsResource **resource, ID3D11Resource *pD3DResource, unsigned int flags);
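/*
 * A minimal sketch of the register/map flow described above, assuming the
 * application already owns an ID3D11Texture2D `tex` (name illustrative) with a
 * DXGI format from the supported list; the function name is illustrative only.
 */
static cudaError_t cudaD3D11MapTextureSketch(ID3D11Texture2D *tex, cudaArray_t *outArray)
{
    struct cudaGraphicsResource *res = NULL;
    cudaError_t err = cudaGraphicsD3D11RegisterResource(&res, (ID3D11Resource *)tex,
                                                        cudaGraphicsRegisterFlagsNone);
    if (err != cudaSuccess)
        return err;
    err = cudaGraphicsMapResources(1, &res, 0);                /* map on the default stream */
    if (err == cudaSuccess) {
        err = cudaGraphicsSubResourceGetMappedArray(outArray, res, 0, 0); /* subresource 0, mip 0 */
        /* ... kernels or cudaMemcpy2DToArray would use *outArray here ... */
        cudaGraphicsUnmapResources(1, &res, 0);
    }
    cudaGraphicsUnregisterResource(res);
    return err;
}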
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
* will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
*
* \param device - Returns the device corresponding to pAdapter
* \param pAdapter - D3D11 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D11GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevice(int *device, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 11 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 11 device \p pD3D11Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 11 device \p pD3D11Device.
*
 * If any of the GPUs being used to render \p pD3D11Device are not CUDA capable, then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D11Device - Direct3D 11 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D11DeviceListAll for all devices,
* ::cudaD3D11DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D11DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D11GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, enum cudaD3D11DeviceList deviceList);
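/*
 * A minimal sketch of the enumeration above, assuming an existing ID3D11Device
 * `dev` (name illustrative): return the first CUDA device rendering for it, or
 * -1 if none is CUDA-capable.
 */
static int cudaD3D11PickDeviceSketch(ID3D11Device *dev)
{
    unsigned int count = 0;
    int devices[8];
    if (cudaD3D11GetDevices(&count, devices, 8, dev, cudaD3D11DeviceListAll) != cudaSuccess ||
        count == 0)
        return -1;
    return devices[0];
}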
/** @} */ /* END CUDART_D3D11 */
/**
* \addtogroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 11 interoperability functions.
*
* @{
*/
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param ppD3D11Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D11SetDirect3DDevice
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11GetDirect3DDevice(ID3D11Device **ppD3D11Device);
/**
* \brief Sets the Direct3D 11 device to use for interoperability with
* a CUDA device
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pD3D11Device - Direct3D device to use for interoperability
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D11DeviceListAll from ::cudaD3D11GetDevices,
 *                        or may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D11GetDevice,
* ::cudaGraphicsD3D11RegisterResource,
* ::cudaDeviceReset
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11SetDirect3DDevice(ID3D11Device *pD3D11Device, int device __dv(-1));
/** @} */ /* END CUDART_D3D11_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D11_INTEROP_H__ */

View file

@@ -1,782 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D9_INTEROP_H__)
#define __CUDA_D3D9_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d9.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D9 Direct3D 9 Interoperability
* This section describes the Direct3D 9 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 9
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D9 device
*/
enum cudaD3D9DeviceList
{
cudaD3D9DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D9 device */
cudaD3D9DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
cudaD3D9DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */
};
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* Returns in \p *ppD3D9Device the Direct3D device against which this CUDA
* context was created in ::cudaD3D9SetDirect3DDevice().
*
* \param ppD3D9Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidGraphicsContext,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cuD3D9GetDirect3DDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3D9Device);
/**
* \brief Register a Direct3D 9 resource for access by CUDA
*
* Registers the Direct3D 9 resource \p pD3DResource for access by CUDA.
*
* If this call is successful then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
* - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
* - ::IDirect3DSurface9: may be accessed through an array.
* Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
* through an array.
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported formats is as follows:
* - D3DFMT_L8
* - D3DFMT_L16
* - D3DFMT_A8R8G8B8
* - D3DFMT_X8R8G8B8
* - D3DFMT_G16R16
* - D3DFMT_A8B8G8R8
* - D3DFMT_A8
* - D3DFMT_A8L8
* - D3DFMT_Q8W8V8U8
* - D3DFMT_V16U16
* - D3DFMT_A16B16G16R16F
* - D3DFMT_A16B16G16R16
* - D3DFMT_R32F
* - D3DFMT_G16R16F
* - D3DFMT_A32B32G32R32F
* - D3DFMT_G32R32F
* - D3DFMT_R16F
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D9RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D9RegisterResource(struct cudaGraphicsResource **resource, IDirect3DResource9 *pD3DResource, unsigned int flags);
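/*
 * A minimal sketch of the buffer path described above, assuming an existing
 * IDirect3DVertexBuffer9 `vb` (name illustrative) and a device association
 * already handled (for example via ::cudaD3D9SetDirect3DDevice below, when it
 * is required at all).
 */
static cudaError_t cudaD3D9MapVertexBufferSketch(IDirect3DVertexBuffer9 *vb)
{
    struct cudaGraphicsResource *res = NULL;
    void  *dptr = NULL;
    size_t size = 0;
    cudaError_t err = cudaGraphicsD3D9RegisterResource(&res, (IDirect3DResource9 *)vb,
                                                       cudaGraphicsRegisterFlagsNone);
    if (err != cudaSuccess)
        return err;
    err = cudaGraphicsMapResources(1, &res, 0);
    if (err == cudaSuccess) {
        err = cudaGraphicsResourceGetMappedPointer(&dptr, &size, res);
        /* ... kernels reading/writing dptr (size bytes) would run here ... */
        cudaGraphicsUnmapResources(1, &res, 0);
    }
    cudaGraphicsUnregisterResource(res);
    return err;
}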
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter name \p pszAdapterName obtained from ::EnumDisplayDevices or
* ::IDirect3D9::GetAdapterIdentifier(). If no device on the adapter with name
* \p pszAdapterName is CUDA-compatible then the call will fail.
*
* \param device - Returns the device corresponding to pszAdapterName
* \param pszAdapterName - D3D9 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cudaGraphicsD3D9RegisterResource,
* ::cuD3D9GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevice(int *device, const char *pszAdapterName);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 9 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 9 device \p pD3D9Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 9 device \p pD3D9Device.
*
 * If any of the GPUs being used to render \p pD3D9Device are not CUDA capable, then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D9Device - Direct3D 9 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D9DeviceListAll for all devices,
* ::cudaD3D9DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D9DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D9GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, enum cudaD3D9DeviceList deviceList);
/**
* \brief Sets the Direct3D 9 device to use for interoperability with
* a CUDA device
*
* Records \p pD3D9Device as the Direct3D 9 device to use for Direct3D 9
* interoperability with the CUDA device \p device and sets \p device as
* the current device for the calling host thread.
*
* If \p device has already been initialized then this call will fail with
* the error ::cudaErrorSetOnActiveProcess. In this case it is necessary
* to reset \p device using ::cudaDeviceReset() before Direct3D 9
* interoperability on \p device may be enabled.
*
* Successfully initializing CUDA interoperability with \p pD3D9Device
* will increase the internal reference count on \p pD3D9Device. This
* reference count will be decremented when \p device is reset using
* ::cudaDeviceReset().
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
 * operating system is Windows Vista or Windows 7, and the device \p pD3D9Device
 * is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pD3D9Device - Direct3D device to use for this thread
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D9DeviceListAll from ::cudaD3D9GetDevices,
 *                        or may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D9GetDevice,
* ::cudaGraphicsD3D9RegisterResource,
* ::cudaDeviceReset
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9SetDirect3DDevice(IDirect3DDevice9 *pD3D9Device, int device __dv(-1));
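/*
 * A minimal sketch tying ::cudaD3D9GetDevices and ::cudaD3D9SetDirect3DDevice
 * together, assuming an existing IDirect3DDevice9 `d3d9dev` (name illustrative)
 * and that no CUDA work has touched the chosen device yet.
 */
static cudaError_t cudaD3D9BindDeviceSketch(IDirect3DDevice9 *d3d9dev)
{
    unsigned int count = 0;
    int cudaDev = -1;
    cudaError_t err = cudaD3D9GetDevices(&count, &cudaDev, 1, d3d9dev, cudaD3D9DeviceListAll);
    if (err != cudaSuccess || count == 0)
        return cudaErrorNoDevice;
    /* Passing -1 instead would let the runtime choose a suitable device. */
    return cudaD3D9SetDirect3DDevice(d3d9dev, cudaDev);
}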
/** @} */ /* END CUDART_D3D9 */
/**
* \addtogroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 9 interoperability functions.
*
* @{
*/
/**
* CUDA D3D9 Register Flags
*/
enum cudaD3D9RegisterFlags
{
  cudaD3D9RegisterFlagsNone  = 0,  /**< Default; Resource can be accessed through a void* */
cudaD3D9RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */
};
/**
* CUDA D3D9 Map Flags
*/
enum cudaD3D9MapFlags
{
cudaD3D9MapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaD3D9MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaD3D9MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Registers a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaD3D9UnregisterResource(). Also on success, this call will increase
* the internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaD3D9UnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: No notes.
* - ::IDirect3DIndexBuffer9: No notes.
* - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
* associated with all mipmap levels of all faces of the texture will be
* accessible to CUDA.
*
* The \p flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following value is allowed:
*
* - ::cudaD3D9RegisterFlagsNone: Specifies that CUDA will access this
* resource through a \p void*. The pointer, size, and pitch for each
* subresource of this resource may be queried through
* ::cudaD3D9ResourceGetMappedPointer(), ::cudaD3D9ResourceGetMappedSize(),
* and ::cudaD3D9ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations:
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
* not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context, then
* ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
 * (e.g., is a non-stand-alone ::IDirect3DSurface9) or is already registered,
* then ::cudaErrorInvalidResourceHandle is returned. If \p pResource cannot
* be registered then ::cudaErrorUnknown is returned.
*
* \param pResource - Resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsD3D9RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int flags);
/**
* \brief Unregisters a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterResource(IDirect3DResource9 *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are
* mapped by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D
* calls issued before ::cudaD3D9MapResources() will complete before any CUDA
* kernels issued after ::cudaD3D9MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapResources(int count, IDirect3DResource9 **ppResources);
/**
* \brief Unmap Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cudaD3D9UnmapResources() will complete before any Direct3D
* calls issued after ::cudaD3D9UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapResources(int count, IDirect3DResource9 **ppResources);
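/*
 * A minimal sketch of the deprecated map/unmap pair above, assuming `pRes`
 * (name illustrative) was registered earlier with
 * ::cudaD3D9RegisterResource(pRes, cudaD3D9RegisterFlagsNone).
 */
static cudaError_t cudaD3D9LegacyMapSketch(IDirect3DResource9 *pRes)
{
    cudaError_t err = cudaD3D9MapResources(1, &pRes);  /* prior D3D9 work completes first */
    if (err != cudaSuccess)
        return err;
    /* ... CUDA kernels may access the resource while it stays mapped ... */
    return cudaD3D9UnmapResources(1, &pRes);           /* CUDA work completes before later D3D9 calls */
}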
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaD3D9MapFlagsNone: Specifies no hints about how this resource will
* be used. It is therefore assumed that this resource will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaD3D9MapFlagsReadOnly: Specifies that CUDA kernels which access this
* resource will not write to this resource.
* - ::cudaD3D9MapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this resource will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously stored in
* the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is presently
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
*
* \param pResource - Registered resource to set flags for
* \param flags - Parameters for resource mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaInteropResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int flags);
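/*
 * A minimal sketch of the flag semantics above: mark a registered, currently
 * unmapped resource `pRes` (name illustrative) as write-discard so its previous
 * contents need not be preserved on the next map.
 */
static cudaError_t cudaD3D9MarkWriteDiscardSketch(IDirect3DResource9 *pRes)
{
    return cudaD3D9ResourceSetMapFlags(pRes, cudaD3D9MapFlagsWriteDiscard);
}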
/**
* \brief Get the dimensions of a registered Direct3D surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource which corresponds
* to \p face and \p level.
*
* Since anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or
* ::IDirect3DSurface9 or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidResourceHandle is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer.
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* \notefnerr
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
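/*
 * A minimal sketch of the dimension query above for a registered stand-alone
 * IDirect3DSurface9 `surf` (name illustrative); as noted, *pDepth is returned
 * as 0 for 2D surfaces.
 */
static cudaError_t cudaD3D9SurfaceExtentSketch(IDirect3DSurface9 *surf, size_t *w, size_t *h)
{
    size_t depth = 0;
    return cudaD3D9ResourceGetSurfaceDimensions(w, h, &depth, (IDirect3DResource9 *)surf, 0, 0);
}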
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p face and \p level
* may be accessed. The value set in \p pArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D9RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
* returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param ppArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedArray(cudaArray **ppArray, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
/**
* \brief Get a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPointer the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p face and \p level.
* The value set in \p pPointer may change every time that \p pResource is
* mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D9RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
* returned.
*
 * If \p pResource is of type ::IDirect3DCubeTexture9, then \p face must be one
* of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types,
* \p face must be 0. If \p face is invalid, then ::cudaErrorInvalidValue is
* returned.
*
* If \p pResource is of type ::IDirect3DBaseTexture9, then \p level must
* correspond to a valid mipmap level. Only mipmap level 0 is supported for
* now. For all other types \p level must be 0. If \p level is invalid, then
* ::cudaErrorInvalidValue is returned.
*
* \param pPointer - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPointer(void **pPointer, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
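/*
 * A minimal sketch of the face/level rules above, assuming `cube` (name
 * illustrative) is a registered and currently mapped IDirect3DCubeTexture9;
 * only mipmap level 0 is queried, matching the restriction in the text.
 */
static void *cudaD3D9CubeFacePointerSketch(IDirect3DCubeTexture9 *cube, D3DCUBEMAP_FACES face)
{
    void *ptr = NULL;
    if (cudaD3D9ResourceGetMappedPointer(&ptr, (IDirect3DResource9 *)cube,
                                         (unsigned int)face, 0) != cudaSuccess)
        return NULL;
    return ptr;
}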
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p face and \p level. The value
* set in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
* registered with usage flags ::cudaD3D9RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p face and \p level. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
 * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA,
* then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
* registered with usage flags ::cudaD3D9RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
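/*
 * A minimal sketch combining the pointer, size, and pitch queries above for a
 * mapped 2D texture `tex` (name illustrative), face 0, mipmap level 0,
 * registered with ::cudaD3D9RegisterFlagsNone.
 */
static void *cudaD3D9TexelAddressSketch(IDirect3DBaseTexture9 *tex,
                                        size_t x, size_t y, size_t bytesPerPixel)
{
    void  *base = NULL;
    size_t size = 0, pitch = 0, slicePitch = 0, offset;
    if (cudaD3D9ResourceGetMappedPointer(&base, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess ||
        cudaD3D9ResourceGetMappedSize(&size, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess ||
        cudaD3D9ResourceGetMappedPitch(&pitch, &slicePitch, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess)
        return NULL;
    /* 2D form of the formula above: y * pitch + bytesPerPixel * x */
    offset = y * pitch + bytesPerPixel * x;
    return (offset < size) ? (char *)base + offset : NULL;
}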
/* D3D9 1.x interop interface */
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9Begin(IDirect3DDevice9 *pDevice);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9End(void);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapVertexBuffer(void **dptr, IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
/** @} */ /* END CUDART_D3D9_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D9_INTEROP_H__ */

View file

@@ -1,265 +0,0 @@
/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
inline __device__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#if defined(__cplusplus)
}
#endif
#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
# define __DEPRECATED__(msg)
#elif defined(_WIN32)
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
# define __DEPRECATED__(msg) __attribute__((deprecated))
#else
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
#endif
#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING)
# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated and will not be supported in a future release. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
#else
# define __CDPRT_DEPRECATED(func_name)
#endif
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "crt/host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
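/*
 * A minimal device-side launch sketch: the <<< >>> syntax below is what the
 * compiler lowers onto ::cudaGetParameterBuffer/::cudaLaunchDevice. It assumes
 * separate compilation (-rdc=true) for an sm_35+ target; `childKernelSketch`
 * and `parentKernelSketch` are illustrative names only.
 */
#if defined(__CUDACC__) && 0   /* illustration only, excluded from compilation */
__global__ void childKernelSketch(int *data) { data[threadIdx.x] += 1; }
__global__ void parentKernelSketch(int *data)
{
    if (threadIdx.x == 0) {
        childKernelSketch<<<1, 32>>>(data);      /* device-side launch */
        (void)cudaGetLastError();                /* launch errors surface here */
    }
}
#endif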
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#undef __DEPRECATED__
#undef __CDPRT_DEPRECATED
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */

View file

@@ -1,642 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_EGL_INTEROP_H__)
#define __CUDA_EGL_INTEROP_H__
#include "cuda_runtime_api.h"
#include "cuda_runtime.h"
#include "cudart_platform.h"
#include "EGL/egl.h"
#include "EGL/eglext.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_TYPES
* @{
*/
/**
* Maximum number of planes per frame
*/
#define CUDA_EGL_MAX_PLANES 3
/**
* CUDA EglFrame type - array or pointer
*/
typedef enum cudaEglFrameType_enum
{
cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
} cudaEglFrameType;
/**
* Resource location flags- sysmem or vidmem
*
 * For a CUDA context on an iGPU, video and system memory are equivalent, so
 * these flags have no effect on execution.
 *
 * For a CUDA context on a dGPU, applications can use the
 * ::cudaEglResourceLocationFlags flags to give a hint about the desired location.
*
* ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
* to be accessed by CUDA.
*
* ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
* video memory to be accessed by CUDA.
*
 * There may be additional latency due to new allocation and data migration
 * if the frame is produced in a different memory location.
*/
typedef enum cudaEglResourceLocationFlags_enum {
cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
} cudaEglResourceLocationFlags;
/**
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
*/
typedef enum cudaEglColorFormat_enum {
cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
cudaEglColorFormatR = 9, /**< single color channel in one surface. */
cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
} cudaEglColorFormat;
/**
* CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
*/
typedef struct cudaEglPlaneDesc_st {
unsigned int width; /**< Width of plane */
unsigned int height; /**< Height of plane */
unsigned int depth; /**< Depth of plane */
unsigned int pitch; /**< Pitch of plane */
unsigned int numChannels; /**< Number of channels for the plane */
struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
unsigned int reserved[4]; /**< Reserved for future use */
} cudaEglPlaneDesc;
/**
* CUDA EGLFrame Descriptor - structure defining one frame of EGL.
*
* Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
* Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
* \code
* typedef struct cudaEglPlaneDesc_st {
* unsigned int width;
* unsigned int height;
* unsigned int depth;
* unsigned int pitch;
* unsigned int numChannels;
* struct cudaChannelFormatDesc channelDesc;
* unsigned int reserved[4];
* } cudaEglPlaneDesc;
* \endcode
*/
typedef struct cudaEglFrame_st {
union {
cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
} frame;
cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
unsigned int planeCount; /**< Number of planes */
cudaEglFrameType frameType; /**< Array or Pitch */
cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
} cudaEglFrame;
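/**
 * A minimal sketch of filling a pitch-type, single-plane ::cudaEglFrame for an RGBA
 * surface (the device allocation, pitch and dimensions below are illustrative
 * placeholders, not values mandated by the API):
 * \code
 * void *devPtr = NULL;
 * size_t pitch = 0;
 * const unsigned int width = 1920, height = 1080;
 * cudaMallocPitch(&devPtr, &pitch, width * 4, height); // 4 bytes per RGBA pixel
 *
 * cudaEglFrame frame;
 * memset(&frame, 0, sizeof(frame));
 * frame.frameType      = cudaEglFrameTypePitch;
 * frame.eglColorFormat = cudaEglColorFormatARGB;
 * frame.planeCount     = 1;
 * frame.frame.pPitch[0] = make_cudaPitchedPtr(devPtr, pitch, width, height);
 * frame.planeDesc[0].width       = width;
 * frame.planeDesc[0].height      = height;
 * frame.planeDesc[0].depth       = 1;
 * frame.planeDesc[0].pitch       = (unsigned int)pitch;
 * frame.planeDesc[0].numChannels = 4;
 * frame.planeDesc[0].channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
 * \endcode
 */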
/**
 * CUDA EGLStream Connection
*/
typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
/** @} */ /* END CUDART_TYPES */
/**
* \addtogroup CUDART_EGL EGL Interoperability
* This section describes the EGL interoperability functions of the CUDA
* runtime application programming interface.
*
* @{
*/
/**
* \brief Registers an EGL image
*
* Registers the EGLImageKHR specified by \p image for access by
* CUDA. A handle to the registered object is returned as \p pCudaResource.
* Additional Mapping/Unmapping is not required for the registered resource and
* ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
*
 * The application is responsible for synchronizing access to shared objects.
 * The application must ensure that any pending operations which access the objects have completed
 * before passing control to CUDA. This may be accomplished by issuing and waiting for a
 * glFinish command on all GL contexts (for OpenGL, and likewise for other APIs).
 * The application is also responsible for ensuring that any pending operations on the
 * registered CUDA resource have completed prior to executing subsequent commands in other APIs
 * accessing the same memory objects.
 * This can be accomplished by calling cuCtxSynchronize or, preferably, cuEventSynchronize.
*
* The surface's intended usage is specified using \p flags, as follows:
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
 * The EGLImageKHR is an object which can be used to create an EGLImage target resource. It is defined as a void pointer.
* typedef void* EGLImageKHR
*
* \param pCudaResource - Pointer to the returned object handle
* \param image - An EGLImageKHR image which can be used to create target resource.
* \param flags - Map flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsResourceGetMappedEglFrame,
* ::cuGraphicsEGLRegisterImage
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
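/**
 * A minimal usage sketch, assuming \p eglImage is an EGLImageKHR already created
 * through the EGL API:
 * \code
 * struct cudaGraphicsResource *cudaResource = NULL;
 * cudaError_t err = cudaGraphicsEGLRegisterImage(&cudaResource, eglImage,
 *                                                cudaGraphicsRegisterFlagsNone);
 * if (err == cudaSuccess) {
 *     // Access the frame via cudaGraphicsResourceGetMappedEglFrame(), then clean up.
 *     cudaGraphicsUnregisterResource(cudaResource);
 * }
 * \endcode
 */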
/**
* \brief Connect CUDA to EGLStream as a consumer.
*
* Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
*
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
* API to another.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerConnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
/**
* \brief Connect CUDA to EGLStream as a consumer with given flags.
*
 * Connect CUDA as a consumer to the EGLStreamKHR specified by \p eglStream, with the
 * specified \p flags defined by ::cudaEglResourceLocationFlags.
*
* The flags specify whether the consumer wants to access frames from system memory or video memory.
* Default is ::cudaEglResourceLocationVidmem.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
* \param flags - Flags denote intended location - system or video.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerConnectWithFlags
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
/**
 * \brief Disconnect CUDA as a consumer from EGLStream.
 *
 * Disconnect CUDA as a consumer from EGLStreamKHR.
 *
 * \param conn - Connection to disconnect.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerDisconnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
/**
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
*
* Acquire an image frame from EGLStreamKHR.
* ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
* ::cudaEglFrame.
*
* \param conn - Connection on which to acquire
* \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
* \param pStream - CUDA stream for synchronization and any data migrations
* implied by ::cudaEglResourceLocationFlags.
* \param timeout - Desired timeout in usec.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown,
* ::cudaErrorLaunchTimeout
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerAcquireFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
/**
* \brief Releases the last frame acquired from the EGLStream.
*
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
*
* \param conn - Connection on which to release
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
* \param pStream - CUDA stream on which release will be done.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cuEGLStreamConsumerReleaseFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
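/**
 * A minimal consumer-side sketch, assuming \p eglStream is an existing EGLStreamKHR
 * with a producer attached (the 16000 usec timeout is an arbitrary example value):
 * \code
 * cudaEglStreamConnection conn;
 * cudaGraphicsResource_t resource = NULL;
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 * if (cudaEGLStreamConsumerConnect(&conn, eglStream) == cudaSuccess) {
 *     if (cudaEGLStreamConsumerAcquireFrame(&conn, &resource, &stream, 16000) == cudaSuccess) {
 *         cudaEglFrame eglFrame;
 *         cudaGraphicsResourceGetMappedEglFrame(&eglFrame, resource, 0, 0);
 *         // ... process eglFrame.frame.pArray[i] or eglFrame.frame.pPitch[i] on `stream` ...
 *         cudaEGLStreamConsumerReleaseFrame(&conn, resource, &stream);
 *     }
 *     cudaEGLStreamConsumerDisconnect(&conn);
 * }
 * cudaStreamDestroy(stream);
 * \endcode
 */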
/**
* \brief Connect CUDA to EGLStream as a producer.
*
 * Connect CUDA as a producer to the EGLStreamKHR specified by \p eglStream.
*
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
* API to another.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
* \param width - width of the image to be submitted to the stream
* \param height - height of the image to be submitted to the stream
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerConnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
EGLStreamKHR eglStream, EGLint width, EGLint height);
/**
 * \brief Disconnect CUDA as a producer from EGLStream.
 *
 * Disconnect CUDA as a producer from EGLStreamKHR.
 *
 * \param conn - Connection to disconnect.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerDisconnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
/**
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
*
* The ::cudaEglFrame is defined as:
* \code
* typedef struct cudaEglFrame_st {
* union {
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
* } frame;
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
* unsigned int planeCount;
* cudaEglFrameType frameType;
* cudaEglColorFormat eglColorFormat;
* } cudaEglFrame;
* \endcode
*
* For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
* allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
* the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
*
* \param conn - Connection on which to present the CUDA array
 * \param eglframe - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
* \param pStream - CUDA stream on which to present the frame.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerPresentFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
cudaEglFrame eglframe, cudaStream_t *pStream);
/**
* \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
*
 * This API can potentially return ::cudaErrorLaunchTimeout if the consumer has not
 * returned a frame to the EGL stream. If a timeout is returned, the application can retry.
*
* \param conn - Connection on which to present the CUDA array
 * \param eglframe - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
* \param pStream - CUDA stream on which to return the frame.
*
* \return
* ::cudaSuccess,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cuEGLStreamProducerReturnFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
cudaEglFrame *eglframe, cudaStream_t *pStream);
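/**
 * A minimal producer-side sketch, assuming \p eglStream already has a consumer
 * connected and \p frame is a populated ::cudaEglFrame (for example, filled as in the
 * sketch following the ::cudaEglFrame definition):
 * \code
 * cudaEglStreamConnection conn;
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 * if (cudaEGLStreamProducerConnect(&conn, eglStream, 1920, 1080) == cudaSuccess) {
 *     cudaEGLStreamProducerPresentFrame(&conn, frame, &stream);
 *     // Later, once the consumer has released the frame, take it back for reuse.
 *     cudaEglFrame returned;
 *     cudaEGLStreamProducerReturnFrame(&conn, &returned, &stream);
 *     cudaEGLStreamProducerDisconnect(&conn);
 * }
 * cudaStreamDestroy(stream);
 * \endcode
 */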
/**
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
*
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
* \p resource may be accessed.
* This API can only be called for EGL graphics resources.
*
* The ::cudaEglFrame is defined as
* \code
* typedef struct cudaEglFrame_st {
* union {
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
* } frame;
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
* unsigned int planeCount;
* cudaEglFrameType frameType;
* cudaEglColorFormat eglColorFormat;
* } cudaEglFrame;
* \endcode
*
*
* \param eglFrame - Returned eglFrame.
* \param resource - Registered resource to access.
* \param index - Index for cubemap surfaces.
* \param mipLevel - Mipmap level for the subresource to access.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
 * \note For a multiplanar \p *eglFrame, only the pitch of the first plane (unsigned int cudaEglPlaneDesc::pitch) should be considered by the application.
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsResourceGetMappedEglFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
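/**
 * A minimal sketch of inspecting the plane layout of a mapped frame, assuming
 * \p cudaResource was obtained from ::cudaGraphicsEGLRegisterImage or
 * ::cudaEGLStreamConsumerAcquireFrame:
 * \code
 * cudaEglFrame eglFrame;
 * if (cudaGraphicsResourceGetMappedEglFrame(&eglFrame, cudaResource, 0, 0) == cudaSuccess) {
 *     for (unsigned int p = 0; p < eglFrame.planeCount; ++p) {
 *         // eglFrame.frameType selects between frame.pArray[p] and frame.pPitch[p];
 *         // per the note above, only planeDesc[0].pitch is meaningful for multiplanar frames.
 *         unsigned int w = eglFrame.planeDesc[p].width;   // plane dimensions, shown for illustration
 *         unsigned int h = eglFrame.planeDesc[p].height;
 *     }
 * }
 * \endcode
 */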
/**
* \brief Creates an event from EGLSync object
*
 * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
 * via \p flags. Valid flags include:
* - ::cudaEventDefault: Default event creation flag.
* - ::cudaEventBlockingSync: Specifies that the created event should use blocking
* synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
* an event created with this flag will block until the event has actually
* been completed.
*
* ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
*
* The EGLSyncKHR is an opaque handle to an EGL sync object.
* typedef void* EGLSyncKHR
*
* \param phEvent - Returns newly created event
* \param eglSync - Opaque handle to EGLSync object
* \param flags - Event creation flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure,
* ::cudaErrorMemoryAllocation
*
* \sa
* ::cudaEventQuery,
* ::cudaEventSynchronize,
* ::cudaEventDestroy
*/
extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
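/**
 * A minimal sketch, assuming \p eglSync is an EGLSyncKHR obtained from eglCreateSyncKHR():
 * \code
 * cudaEvent_t ev;
 * if (cudaEventCreateFromEGLSync(&ev, eglSync, cudaEventDefault) == cudaSuccess) {
 *     cudaEventSynchronize(ev); // wait for the EGL sync object to be signaled
 *     cudaEventDestroy(ev);
 * }
 * \endcode
 */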
/** @} */ /* END CUDART_EGL */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* __CUDA_EGL_INTEROP_H__ */

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,508 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_GL_INTEROP_H__)
#define __CUDA_GL_INTEROP_H__
#include "cuda_runtime_api.h"
#if defined(__APPLE__)
#include <OpenGL/gl.h>
#else /* __APPLE__ */
#if defined(__arm__) || defined(__aarch64__)
#ifndef GL_VERSION
#error Please include the appropriate gl headers before including cuda_gl_interop.h
#endif
#else
#include <GL/gl.h>
#endif
#endif /* __APPLE__ */
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_OPENGL OpenGL Interoperability
* This section describes the OpenGL interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of OpenGL
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to the current OpenGL context
*/
enum cudaGLDeviceList
{
cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
};
/**
* \brief Gets the CUDA devices associated with the current OpenGL context
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
* context are not CUDA capable then the call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
* current OpenGL context
* \param pCudaDevices - Returned CUDA devices corresponding to the current
* OpenGL context
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param deviceList - The set of devices to return. This set may be
* ::cudaGLDeviceListAll for all devices,
* ::cudaGLDeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaGLDeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorInvalidGraphicsContext,
* ::cudaErrorUnknown
*
* \note This function is not supported on Mac OS X.
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGLGetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
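/**
 * A minimal sketch; an OpenGL context must be current on the calling thread:
 * \code
 * unsigned int deviceCount = 0;
 * int devices[8];
 * if (cudaGLGetDevices(&deviceCount, devices, 8, cudaGLDeviceListAll) == cudaSuccess
 *     && deviceCount > 0) {
 *     cudaSetDevice(devices[0]); // use the first device driving this GL context
 * }
 * \endcode
 */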
/**
* \brief Register an OpenGL texture or renderbuffer object
*
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
* A handle to the registered object is returned as \p resource.
*
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
* or ::GL_RENDERBUFFER.
*
* The register flags \p flags specify the intended usage, as follows:
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* The following image formats are supported. For brevity's sake, the list is abbreviated.
 * For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
*
* The following image classes are currently disallowed:
* - Textures with borders
* - Multisampled renderbuffers
*
* \param resource - Pointer to the returned object handle
* \param image - name of texture or renderbuffer object to be registered
* \param target - Identifies the type of object specified by \p image
* \param flags - Register flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cuGraphicsGLRegisterImage
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
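/**
 * A minimal sketch, assuming \p tex is a GL_TEXTURE_2D texture object already created
 * and populated by the application:
 * \code
 * struct cudaGraphicsResource *res = NULL;
 * cudaGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
 * cudaGraphicsMapResources(1, &res, 0);
 * cudaArray_t array;
 * cudaGraphicsSubResourceGetMappedArray(&array, res, 0, 0);
 * // ... bind `array` to a texture or surface object and launch kernels ...
 * cudaGraphicsUnmapResources(1, &res, 0);
 * cudaGraphicsUnregisterResource(res);
 * \endcode
 */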
/**
* \brief Registers an OpenGL buffer object
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. A handle to the registered object is returned as \p
* resource. The register flags \p flags specify the intended usage,
* as follows:
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
* \param resource - Pointer to the returned object handle
* \param buffer - name of buffer object to be registered
* \param flags - Register flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsGLRegisterBuffer
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
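/**
 * A minimal sketch, assuming \p vbo is an OpenGL buffer object created with
 * glGenBuffers()/glBufferData():
 * \code
 * struct cudaGraphicsResource *res = NULL;
 * cudaGraphicsGLRegisterBuffer(&res, vbo, cudaGraphicsRegisterFlagsWriteDiscard);
 * cudaGraphicsMapResources(1, &res, 0);
 * void *devPtr = NULL;
 * size_t size = 0;
 * cudaGraphicsResourceGetMappedPointer(&devPtr, &size, res);
 * // ... fill devPtr from a kernel; the results become visible to OpenGL after unmap ...
 * cudaGraphicsUnmapResources(1, &res, 0);
 * cudaGraphicsUnregisterResource(res);
 * \endcode
 */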
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
/**
* \brief Gets the CUDA device associated with hGpu
*
* Returns the CUDA device associated with a hGpu, if applicable.
*
* \param device - Returns the device associated with hGpu, or -1 if hGpu is
* not a compute device.
* \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::WGL_NV_gpu_affinity,
* ::cuWGLGetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
#endif
/** @} */ /* END CUDART_OPENGL */
/**
* \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
* This section describes deprecated OpenGL interoperability functionality.
*
* @{
*/
/**
* CUDA GL Map Flags
*/
enum cudaGLMapFlags
{
cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Sets a CUDA device to use OpenGL interoperability
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with an OpenGL
* context in order to achieve maximum interoperability performance.
*
* \param device - Device to use for OpenGL interoperability
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
/**
* \brief Registers a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the buffer object of ID \p bufObj for access by
* CUDA. This function must be called before CUDA can map the buffer
* object. The OpenGL context used to create the buffer, or another
* context from the same share group, must be bound to the current
* thread when this is called.
*
* \param bufObj - Buffer object ID to register
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaGraphicsGLRegisterBuffer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
/**
* \brief Maps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the buffer object of ID \p bufObj into the address space of
* CUDA and returns in \p *devPtr the base pointer of the resulting
* mapping. The buffer must have previously been registered by
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
* by CUDA, any OpenGL operation which references the buffer will
* result in undefined behavior. The OpenGL context used to create
* the buffer, or another context from the same share group, must be
* bound to the current thread when this is called.
*
* All streams in the current thread are synchronized with the current
* GL context.
*
* \param devPtr - Returned device pointer to CUDA object
* \param bufObj - Buffer object ID to map
*
* \return
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
/**
* \brief Unmaps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
* a buffer is unmapped, the base address returned by
* ::cudaGLMapBufferObject() is invalid and subsequent references to
* the address result in undefined behavior. The OpenGL context used
* to create the buffer, or another context from the same share group,
* must be bound to the current thread when this is called.
*
* All streams in the current thread are synchronized with the current
* GL context.
*
* \param bufObj - Buffer object to unmap
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnmapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
/**
* \brief Unregisters a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the buffer object of ID \p bufObj for access by CUDA
* and releases any CUDA resources associated with the buffer. Once a
* buffer is unregistered, it may no longer be mapped by CUDA. The GL
* context used to create the buffer, or another context from the
* same share group, must be bound to the current thread when this is
* called.
*
* \param bufObj - Buffer object to unregister
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
/**
* \brief Set usage flags for mapping an OpenGL buffer
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the OpenGL buffer \p bufObj
*
* Changes to flags will take effect the next time \p bufObj is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
* be used. It is therefore assumed that this buffer will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
* buffer will not write to the buffer.
* - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this buffer will not read from the buffer and will write over the
* entire contents of the buffer, so none of the data previously stored in
* the buffer will be preserved.
*
* If \p bufObj has not been registered for use with CUDA, then
* ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
*
* \param bufObj - Registered buffer object to set flags for
* \param flags - Parameters for buffer mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
/**
* \brief Maps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the buffer object of ID \p bufObj into the address space of
* CUDA and returns in \p *devPtr the base pointer of the resulting
* mapping. The buffer must have previously been registered by
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
* by CUDA, any OpenGL operation which references the buffer will
* result in undefined behavior. The OpenGL context used to create
* the buffer, or another context from the same share group, must be
* bound to the current thread when this is called.
*
 * Stream \p stream is synchronized with the current GL context.
*
* \param devPtr - Returned device pointer to CUDA object
* \param bufObj - Buffer object ID to map
* \param stream - Stream to synchronize
*
* \return
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
/**
* \brief Unmaps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
* a buffer is unmapped, the base address returned by
* ::cudaGLMapBufferObject() is invalid and subsequent references to
* the address result in undefined behavior. The OpenGL context used
* to create the buffer, or another context from the same share group,
* must be bound to the current thread when this is called.
*
 * Stream \p stream is synchronized with the current GL context.
*
* \param bufObj - Buffer object to unmap
* \param stream - Stream to synchronize
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnmapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
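/**
 * A minimal sketch of the deprecated buffer-object flow, assuming \p vbo is an existing
 * OpenGL buffer object; new code should prefer the ::cudaGraphicsGLRegisterBuffer path
 * shown earlier:
 * \code
 * cudaGLRegisterBufferObject(vbo);
 * void *devPtr = NULL;
 * cudaGLMapBufferObject(&devPtr, vbo);
 * // ... access devPtr from CUDA ...
 * cudaGLUnmapBufferObject(vbo);
 * cudaGLUnregisterBufferObject(vbo);
 * \endcode
 */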
/** @} */ /* END CUDART_OPENGL_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __CUDA_DEPRECATED
#endif /* __CUDA_GL_INTEROP_H__ */

File diff suppressed because it is too large

View file

@@ -1,224 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_H_
# define _CUDA_PIPELINE_H_
# include "cuda_pipeline_primitives.h"
# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
-std=c++11 compiler option.
# endif
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# include "cuda_awbarrier.h"
# endif
// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
# else
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
# endif
# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
struct __block_scope_barrier_base;
}}
# endif
_CUDA_PIPELINE_BEGIN_NAMESPACE
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N];
class pipeline {
public:
pipeline(const pipeline&) = delete;
pipeline(pipeline&&) = delete;
pipeline& operator=(const pipeline&) = delete;
pipeline& operator=(pipeline&&) = delete;
_CUDA_PIPELINE_QUALIFIER pipeline();
_CUDA_PIPELINE_QUALIFIER size_t commit();
_CUDA_PIPELINE_QUALIFIER void commit_and_wait();
_CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER void wait_prior();
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
_CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
# endif
private:
size_t current_batch;
};
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe);
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
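/**
 * A minimal device-side usage sketch for the declarations above. It assumes
 * _CUDA_PIPELINE_BEGIN_NAMESPACE expands to nvcuda::experimental (as in the CUDA 11
 * toolkits), a launch with 256 threads per block, and compilation with -std=c++11:
 * \code
 * #include <cuda_pipeline.h>
 *
 * __global__ void scale_copy(float* out, const float* in, size_t n)
 * {
 *     using namespace nvcuda::experimental;
 *     __shared__ float tile[256];
 *     pipeline pipe;
 *     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n) {
 *         // Stage the global->shared copy asynchronously, then wait on the committed batch.
 *         memcpy_async(tile[threadIdx.x], in[i], pipe);
 *         pipe.commit_and_wait();
 *         out[i] = 2.0f * tile[threadIdx.x];
 *     }
 * }
 * \endcode
 */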
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N]
{
return (T(*)[N])ptr;
}
_CUDA_PIPELINE_QUALIFIER
pipeline::pipeline()
: current_batch(0)
{
}
_CUDA_PIPELINE_QUALIFIER
size_t pipeline::commit()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
return this->current_batch++;
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::commit_and_wait()
{
(void)pipeline::commit();
pipeline::wait_prior<0>();
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait(size_t batch)
{
const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
switch (prior) {
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
}
}
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait_prior()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
}
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(awbarrier& barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
}
# endif
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe)
{
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
if (__is_trivially_copyable(T)) {
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
} else {
dst = src;
}
}
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
{
constexpr size_t dst_size = sizeof(*dst);
constexpr size_t src_size = sizeof(*src);
static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
if (__is_trivially_copyable(T)) {
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
} else {
for (size_t i = 0; i < DstN; ++i) {
(*dst)[i] = (i < SrcN) ? (*src)[i] : T();
}
}
}
_CUDA_PIPELINE_END_NAMESPACE
#endif /* !_CUDA_PIPELINE_H_ */
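
Below is a minimal usage sketch of the pipeline API declared in this header (not part of the deleted file; the kernel, buffer names, and launch configuration are illustrative). Each thread stages one element from global into shared memory through the per-thread pipeline, then commits and waits before consuming it:

// Illustrative only; compile with nvcc -std=c++11, launch with 256 threads per block.
#include <cuda_pipeline.h>

__global__ void copy_and_scale(const float* __restrict__ in, float* __restrict__ out, int n)
{
    __shared__ float tile[256];
    nvcuda::experimental::pipeline pipe;

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // On sm_80+ this lowers to cp.async; otherwise the synchronous fallback is used.
        nvcuda::experimental::memcpy_async(tile[threadIdx.x], in[idx], pipe);
        pipe.commit_and_wait();
    }
    __syncthreads();

    if (idx < n) {
        out[idx] = 2.0f * tile[threadIdx.x];
    }
}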

View file

@@ -1,373 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_HELPERS_H_
# define _CUDA_PIPELINE_HELPERS_H_
# define _CUDA_PIPELINE_NAMESPACE nvcuda::experimental
# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
# define _CUDA_PIPELINE_END_NAMESPACE } }
# define _CUDA_PIPELINE_INTERNAL_NAMESPACE _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE } _CUDA_PIPELINE_END_NAMESPACE
# if !defined(_CUDA_PIPELINE_QUALIFIER)
# define _CUDA_PIPELINE_QUALIFIER inline __device__
# endif
# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
# define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
# endif
# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_PIPELINE_ARCH_700_OR_LATER
# endif
# if (__CUDA_ARCH__ >= 800)
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
# else
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
# endif
# if !defined(_CUDA_PIPELINE_MAX_STAGES)
# define _CUDA_PIPELINE_MAX_STAGES 8
# endif
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
# endif
# if !defined(_CUDA_PIPELINE_DEBUG)
# if defined(__CUDACC_DEBUG__)
# define _CUDA_PIPELINE_DEBUG 1
# else
# define _CUDA_PIPELINE_DEBUG 0
# endif
# endif
# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
# include <cassert>
# endif
# define _CUDA_PIPELINE_ASSERT(x) assert((x));
# define _CUDA_PIPELINE_ABORT() assert(0);
# else
# define _CUDA_PIPELINE_ASSERT(x)
# define _CUDA_PIPELINE_ABORT() __trap();
# endif
# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
# else
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
# endif
# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
# else
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
# endif
# if defined(__CUDACC_RTC__)
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
# else
# include <stdint.h>
# endif
_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'");
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
char* const d = reinterpret_cast<char*>(dst);
const char* const s = reinterpret_cast<const char*>(src);
size_t copy_step_size;
if (SourceSize == 0) {
copy_step_size = CopySize;
} else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
copy_step_size = SourceSize;
} else {
copy_step_size = 1;
}
for (size_t i = 0; i < CopySize; i += copy_step_size) {
const bool copy_source = SourceSize && (i < SourceSize);
switch (copy_step_size) {
case 1:
d[i] = copy_source ? s[i] : char();
break;
case 2:
*reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
break;
case 4:
*reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
break;
case 8:
*reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
break;
case 16:
*reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
break;
}
}
}
template<bool UseHwAsyncCopy>
struct ImplementationChooser;
template<>
struct ImplementationChooser<true> {
template<size_t CopySize, size_t SourceSize>
struct CpAsyncChooser {
_CUDA_PIPELINE_STATIC_QUALIFIER
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
{
asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
:
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
"n"(SourceSize)
: "memory");
}
};
template<size_t SourceSize>
struct CpAsyncChooser<16, SourceSize> {
_CUDA_PIPELINE_STATIC_QUALIFIER
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
{
asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
:
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
: "memory");
}
};
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_commit()
{
asm volatile ("cp.async.commit_group;");
}
template<unsigned N>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_wait_prior()
{
asm volatile ("cp.async.wait_group %0;"
:
: "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
_CUDA_PIPELINE_ASSERT(__isShared(barrier));
asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
:
: "r"(__nvvm_get_smem_pointer(barrier)));
}
};
template<>
struct ImplementationChooser<false> {
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_commit()
{
}
template<unsigned N>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_wait_prior()
{
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
}
};
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
}
_CUDA_PIPELINE_QUALIFIER
void pipeline_commit()
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
}
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline_wait_prior()
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
}
_CUDA_PIPELINE_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
}
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
if (__isGlobal(src) && __isShared(dst)) {
pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
} else {
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
}
}
template<size_t CopySize, size_t Align>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
const char* s = reinterpret_cast<const char*>(src);
char* d = reinterpret_cast<char*>(dst);
size_t remaining = CopySize;
while (remaining) {
if ((Align >= 16) && (remaining >= 16)) {
pipeline_copy_strict<16, 16>(d, s);
d += 16;
s += 16;
remaining -= 16;
} else if ((Align >= 8) && (remaining >= 8)) {
pipeline_copy_strict<8, 8>(d, s);
d += 8;
s += 8;
remaining -= 8;
} else if ((Align >= 4) && (remaining >= 4)) {
pipeline_copy_strict<4, 4>(d, s);
d += 4;
s += 4;
remaining -= 4;
} else if ((Align >= 2) && (remaining >= 2)) {
*reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
d += 2;
s += 2;
remaining -= 2;
} else {
*d = *s;
d += 1;
s += 1;
remaining -= 1;
}
}
}
_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
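
The synchronous fallback above (pipeline_memcpy_sync) copies SourceSize bytes and zero-fills the rest of the CopySize-byte destination, which is what gives the zfill semantics their meaning on parts without hardware async copy. A host-side analogue of that behaviour, purely illustrative (the helper name is not part of the header):

#include <cstddef>
#include <cstring>

// Copy SourceSize bytes and zero the tail so the destination always holds CopySize bytes.
template<size_t CopySize, size_t SourceSize>
void zfill_copy(void* dst, const void* src)
{
    static_assert(SourceSize <= CopySize, "source must fit in the copy");
    std::memcpy(dst, src, SourceSize);
    std::memset(static_cast<char*>(dst) + SourceSize, 0, CopySize - SourceSize);
}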

View file

@@ -1,148 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
# define _CUDA_PIPELINE_PRIMITIVES_H_
# include "cuda_pipeline_helpers.h"
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
size_t zfill = 0)
{
_CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
_CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
_CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
_CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
switch (size_and_align) {
case 16:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return;
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return;
case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return;
case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return;
case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return;
case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return;
case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return;
case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return;
case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return;
case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
case 8:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return;
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, src_global); return;
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return;
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return;
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
case 4:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
default:
_CUDA_PIPELINE_ABORT();
return;
}
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_commit()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_wait_prior(size_t prior)
{
switch (prior) {
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
}
}
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# include "cuda_awbarrier_primitives.h"
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_arrive_on(__mbarrier_t* barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
}
# endif
#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
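
A short kernel-level sketch of the primitives above (illustrative names; assumes at most 128 threads per block): each thread issues one 16-byte asynchronous copy whose last 4 bytes are zero-filled, commits the batch, and waits for all committed batches before reading the data back out.

#include <cuda_pipeline_primitives.h>

__global__ void stage_vec4(const float4* __restrict__ gmem, float4* __restrict__ out)
{
    __shared__ float4 smem[128];
    // 16-byte global->shared copy, last 4 bytes zero-filled (zfill = 4).
    __pipeline_memcpy_async(&smem[threadIdx.x], &gmem[threadIdx.x], 16, 4);
    __pipeline_commit();
    __pipeline_wait_prior(0);   // 0 outstanding batches allowed: wait for everything
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}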

View file

@@ -1,207 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_PROFILER_API_H__)
#define __CUDA_PROFILER_API_H__
#include "driver_types.h"
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \ingroup CUDART
* \defgroup CUDART_PROFILER_DEPRECATED Profiler Control [DEPRECATED]
*
* ___MANBRIEF___ profiler control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Initialize the CUDA profiler.
*
* \deprecated
*
 * Using this API, the user can initialize the CUDA profiler by specifying
 * the configuration file, output file, and output file format. This
 * API is generally used to profile different sets of counters by
 * looping over the kernel launch. The \p configFile parameter can be used
* to select profiling options including profiler counters. Refer to
* the "Compute Command Line Profiler User Guide" for supported
* profiler options and counters.
*
* Limitation: The CUDA profiler cannot be initialized with this API
* if another profiling tool is already active, as indicated by the
* ::cudaErrorProfilerDisabled return code.
*
* Typical usage of the profiling APIs is as follows:
*
* for each set of counters/options\n
* {\n
* cudaProfilerInitialize(); //Initialize profiling,set the counters/options in
* the config file \n
* ...\n
* cudaProfilerStart(); \n
* // code to be profiled \n
* cudaProfilerStop();\n
* ...\n
* cudaProfilerStart(); \n
* // code to be profiled \n
* cudaProfilerStop();\n
* ...\n
* }\n
*
*
* \param configFile - Name of the config file that lists the counters/options
* for profiling.
* \param outputFile - Name of the outputFile where the profiling results will
* be stored.
* \param outputMode - outputMode, can be ::cudaKeyValuePair OR ::cudaCSV.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorProfilerDisabled
* \notefnerr
*
* \sa
* ::cudaProfilerStart,
* ::cudaProfilerStop,
* ::cuProfilerInitialize
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaProfilerInitialize(const char *configFile,
const char *outputFile,
cudaOutputMode_t outputMode);
/** @} */ /* END CUDART_PROFILER_DEPRECATED */
/**
* \ingroup CUDART
* \defgroup CUDART_PROFILER Profiler Control
*
* ___MANBRIEF___ profiler control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Enable profiling.
*
* Enables profile collection by the active profiling tool for the
* current context. If profiling is already enabled, then
* cudaProfilerStart() has no effect.
*
* cudaProfilerStart and cudaProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::cudaProfilerInitialize,
* ::cudaProfilerStop,
* ::cuProfilerStart
*/
extern __host__ cudaError_t CUDARTAPI cudaProfilerStart(void);
/**
* \brief Disable profiling.
*
* Disables profile collection by the active profiling tool for the
* current context. If profiling is already disabled, then
* cudaProfilerStop() has no effect.
*
* cudaProfilerStart and cudaProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::cudaProfilerInitialize,
* ::cudaProfilerStart,
* ::cuProfilerStop
*/
extern __host__ cudaError_t CUDARTAPI cudaProfilerStop(void);
/** @} */ /* END CUDART_PROFILER */
#undef __CUDA_DEPRECATED
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !__CUDA_PROFILER_API_H__ */
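
A minimal host-side sketch of the non-deprecated start/stop pair above, narrowing collection to a single launch (kernel and function names are illustrative):

#include <cuda_profiler_api.h>

__global__ void scale(float* data, int n)            // illustrative kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

void profile_one_launch(float* d_data, int n)
{
    cudaProfilerStart();                              // begin collection for this context
    scale<<<(n + 255) / 256, 256>>>(d_data, n);
    cudaDeviceSynchronize();
    cudaProfilerStop();                               // subsequent work is not profiled
}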

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,103 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_SURFACE_TYPES_H__)
#define __CUDA_SURFACE_TYPES_H__
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "channel_descriptor.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
template<class T, int dim = 1>
struct __device_builtin_surface_type__ surface : public surfaceReference
{
#if !defined(__CUDACC_RTC__)
__host__ surface(void)
{
channelDesc = cudaCreateChannelDesc<T>();
}
__host__ surface(struct cudaChannelFormatDesc desc)
{
channelDesc = desc;
}
#endif /* !__CUDACC_RTC__ */
};
template<int dim>
struct __device_builtin_surface_type__ surface<void, dim> : public surfaceReference
{
#if !defined(__CUDACC_RTC__)
__host__ surface(void)
{
channelDesc = cudaCreateChannelDesc<void>();
}
#endif /* !__CUDACC_RTC__ */
};
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__CUDA_SURFACE_TYPES_H__ */
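
A sketch of how the legacy surface-reference template above is typically used (module-scope reference; the names and the host-side binding shown in the trailing comment are illustrative):

#include <cuda_runtime.h>

surface<void, cudaSurfaceType2D> out_surf;            // module-scope surface reference

__global__ void fill_ones(int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // The x coordinate of a surface write is expressed in bytes.
        surf2Dwrite(1.0f, out_surf, x * sizeof(float), y);
    }
}

// Host side, before launch (sketch):
//   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
//   cudaArray_t arr;
//   cudaMallocArray(&arr, &desc, width, height, cudaArraySurfaceLoadStore);
//   cudaBindSurfaceToArray(out_surf, arr);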

View file

@@ -1,109 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_TEXTURE_TYPES_H__)
#define __CUDA_TEXTURE_TYPES_H__
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "channel_descriptor.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
struct __device_builtin_texture_type__ texture : public textureReference
{
#if !defined(__CUDACC_RTC__)
__host__ texture(int norm = 0,
enum cudaTextureFilterMode fMode = cudaFilterModePoint,
enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = cudaCreateChannelDesc<T>();
sRGB = 0;
}
__host__ texture(int norm,
enum cudaTextureFilterMode fMode,
enum cudaTextureAddressMode aMode,
struct cudaChannelFormatDesc desc)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = desc;
sRGB = 0;
}
#endif /* !__CUDACC_RTC__ */
};
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__CUDA_TEXTURE_TYPES_H__ */
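
Similarly, a sketch of the legacy texture-reference template above in use (names illustrative; the host binding is shown as a comment):

#include <cuda_runtime.h>

texture<float, cudaTextureType2D, cudaReadModeElementType> in_tex;   // module-scope texture reference

__global__ void sample(float* out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // Unnormalized coordinates; +0.5f samples the texel centre.
        out[y * width + x] = tex2D(in_tex, x + 0.5f, y + 0.5f);
    }
}

// Host side, before launch (sketch):
//   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
//   cudaArray_t arr;
//   cudaMallocArray(&arr, &desc, width, height);
//   cudaBindTextureToArray(in_tex, arr, desc);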

View file

@@ -1,97 +0,0 @@
/* Copyright 2013,2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cudalibxt.h
* \brief Public header file for the NVIDIA library multi-GPU support structures
*/
#ifndef _CUDA_LIB_XT_H_
#define _CUDA_LIB_XT_H_
#include <cuda_runtime.h>
#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000 // This is added to CUDART_VERSION
enum cudaXtCopyType_t {
LIB_XT_COPY_HOST_TO_DEVICE,
LIB_XT_COPY_DEVICE_TO_HOST,
LIB_XT_COPY_DEVICE_TO_DEVICE
} ;
typedef enum cudaXtCopyType_t cudaLibXtCopyType;
enum libFormat_t {
LIB_FORMAT_CUFFT = 0x0,
LIB_FORMAT_UNDEFINED = 0x1
};
typedef enum libFormat_t libFormat;
#define MAX_CUDA_DESCRIPTOR_GPUS 64
struct cudaXtDesc_t{
int version; //descriptor version
int nGPUs; //number of GPUs
int GPUs[MAX_CUDA_DESCRIPTOR_GPUS]; //array of device IDs
void *data[MAX_CUDA_DESCRIPTOR_GPUS]; //array of pointers to data, one per GPU
size_t size[MAX_CUDA_DESCRIPTOR_GPUS]; //array of data sizes, one per GPU
void *cudaXtState; //opaque CUDA utility structure
};
typedef struct cudaXtDesc_t cudaXtDesc;
struct cudaLibXtDesc_t{
int version; //descriptor version
cudaXtDesc *descriptor; //multi-GPU memory descriptor
libFormat library; //which library recognizes the format
int subFormat; //library specific enumerator of sub formats
void *libDescriptor; //library specific descriptor e.g. FFT transform plan object
};
typedef struct cudaLibXtDesc_t cudaLibXtDesc;
#endif
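
A small host-side sketch of how a library-owned descriptor built on these structures can be inspected (the helper is illustrative; the descriptor itself would come from a library call such as cufftXtMalloc):

#include <cstdio>
#include <cudalibxt.h>

// Walk the per-GPU pointers and sizes held by a multi-GPU descriptor.
void print_desc(const cudaLibXtDesc* d)
{
    const cudaXtDesc* x = d->descriptor;
    std::printf("version %d, %d GPU(s), library %d\n", x->version, x->nGPUs, (int)d->library);
    for (int i = 0; i < x->nGPUs; ++i) {
        std::printf("  GPU %d: %zu bytes at %p\n", x->GPUs[i], x->size[i], x->data[i]);
    }
}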

View file

@@ -1,57 +0,0 @@
/*
* Copyright 2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef __CUDART_PLATFORM_H__
#define __CUDART_PLATFORM_H__
#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
#define isEglSupported 1
#endif
#endif

View file

@@ -1,322 +0,0 @@
/* Copyright 2005-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufft.h
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
*/
#ifndef _CUFFT_H_
#define _CUFFT_H_
#include "cuComplex.h"
#include "driver_types.h"
#include "library_types.h"
#ifndef CUFFTAPI
#ifdef _WIN32
#define CUFFTAPI __stdcall
#elif __GNUC__ >= 4
#define CUFFTAPI __attribute__ ((visibility ("default")))
#else
#define CUFFTAPI
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define CUFFT_VER_MAJOR 10
#define CUFFT_VER_MINOR 7
#define CUFFT_VER_PATCH 1
#define CUFFT_VER_BUILD 0
// cuFFT library version
//
// CUFFT_VERSION / 1000 - major version
// CUFFT_VERSION / 100 % 100 - minor version
// CUFFT_VERSION % 100 - patch level
#define CUFFT_VERSION 10701
// CUFFT API function return values
typedef enum cufftResult_t {
CUFFT_SUCCESS = 0x0,
CUFFT_INVALID_PLAN = 0x1,
CUFFT_ALLOC_FAILED = 0x2,
CUFFT_INVALID_TYPE = 0x3,
CUFFT_INVALID_VALUE = 0x4,
CUFFT_INTERNAL_ERROR = 0x5,
CUFFT_EXEC_FAILED = 0x6,
CUFFT_SETUP_FAILED = 0x7,
CUFFT_INVALID_SIZE = 0x8,
CUFFT_UNALIGNED_DATA = 0x9,
CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
CUFFT_INVALID_DEVICE = 0xB,
CUFFT_PARSE_ERROR = 0xC,
CUFFT_NO_WORKSPACE = 0xD,
CUFFT_NOT_IMPLEMENTED = 0xE,
CUFFT_LICENSE_ERROR = 0x0F,
CUFFT_NOT_SUPPORTED = 0x10
} cufftResult;
#define MAX_CUFFT_ERROR 0x11
// CUFFT defines and supports the following data types
// cufftReal is a single-precision, floating-point real data type.
// cufftDoubleReal is a double-precision, real data type.
typedef float cufftReal;
typedef double cufftDoubleReal;
// cufftComplex is a single-precision, floating-point complex data type that
// consists of interleaved real and imaginary components.
// cufftDoubleComplex is the double-precision equivalent.
typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;
// CUFFT transform directions
#define CUFFT_FORWARD -1 // Forward FFT
#define CUFFT_INVERSE 1 // Inverse FFT
// CUFFT supports the following transform types
typedef enum cufftType_t {
CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType;
// CUFFT supports the following data layouts
typedef enum cufftCompatibility_t {
CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value
} cufftCompatibility;
#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING
//
// structure definition used by the shim between old and new APIs
//
#define MAX_SHIM_RANK 3
// cufftHandle is a handle type used to store and access CUFFT plans.
typedef int cufftHandle;
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx,
cufftType type,
int batch);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny,
cufftType type);
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
int nx, int ny, int nz,
cufftType type);
cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch);
cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
int nx,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
long long int *onembed,
long long int ostride, long long int odist,
cufftType type,
long long int batch,
size_t * workSize);
cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride, long long int idist,
long long int *onembed,
long long int ostride, long long int odist,
cufftType type,
long long int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate1d(int nx,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimateMany(int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
int nx,
cufftType type,
int batch,
size_t *workSize );
cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
int rank, int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type, int batch, size_t *workArea);
cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
cufftComplex *idata,
cufftComplex *odata,
int direction);
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
cufftReal *idata,
cufftComplex *odata);
cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
cufftComplex *idata,
cufftReal *odata);
cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleComplex *odata,
int direction);
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
cufftDoubleReal *idata,
cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleReal *odata);
// utility functions
cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
cudaStream_t stream);
cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
cufftResult CUFFTAPI cufftGetVersion(int *version);
cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
int *value);
#ifdef __cplusplus
}
#endif
#endif /* _CUFFT_H_ */
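
A minimal single-GPU use of the API above: plan a 1D complex-to-complex transform, run it in place forward and inverse, then clean up. Error handling is reduced to a sketch and the buffer is assumed to already live on the device:

#include <cuda_runtime.h>
#include <cufft.h>

int run_fft(cufftComplex* d_signal, int nx)           // d_signal: device buffer of nx elements
{
    cufftHandle plan;
    if (cufftPlan1d(&plan, nx, CUFFT_C2C, /*batch=*/1) != CUFFT_SUCCESS) return -1;

    cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD);
    cufftExecC2C(plan, d_signal, d_signal, CUFFT_INVERSE);   // inverse result is scaled by nx

    cudaDeviceSynchronize();
    cufftDestroy(plan);
    return 0;
}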

View file

@@ -1,257 +0,0 @@
/* Copyright 2005-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufftXt.h
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
*/
#ifndef _CUFFTXT_H_
#define _CUFFTXT_H_
#include "cudalibxt.h"
#include "cufft.h"
#ifndef CUFFTAPI
#ifdef _WIN32
#define CUFFTAPI __stdcall
#else
#define CUFFTAPI
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// cufftXtSubFormat identifies the data layout of
// a memory descriptor owned by cufft.
// note that multi GPU cufft does not yet support out-of-place transforms
//
typedef enum cufftXtSubFormat_t {
CUFFT_XT_FORMAT_INPUT = 0x00, //by default input is in linear order across GPUs
CUFFT_XT_FORMAT_OUTPUT = 0x01, //by default output is in scrambled order depending on transform
CUFFT_XT_FORMAT_INPLACE = 0x02, //by default inplace is input order, which is linear across GPUs
CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03, //shuffled output order after execution of the transform
CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04, //shuffled input order prior to execution of 1D transforms
CUFFT_FORMAT_UNDEFINED = 0x05
} cufftXtSubFormat;
//
// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
//
typedef enum cufftXtCopyType_t {
CUFFT_COPY_HOST_TO_DEVICE = 0x00,
CUFFT_COPY_DEVICE_TO_HOST = 0x01,
CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
CUFFT_COPY_UNDEFINED = 0x03
} cufftXtCopyType;
//
// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
//
typedef enum cufftXtQueryType_t {
CUFFT_QUERY_1D_FACTORS = 0x00,
CUFFT_QUERY_UNDEFINED = 0x01
} cufftXtQueryType;
typedef struct cufftXt1dFactors_t {
long long int size;
long long int stringCount;
long long int stringLength;
long long int substringLength;
long long int factor1;
long long int factor2;
long long int stringMask;
long long int substringMask;
long long int factor1Mask;
long long int factor2Mask;
int stringShift;
int substringShift;
int factor1Shift;
int factor2Shift;
} cufftXt1dFactors;
//
// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
//
typedef enum cufftXtWorkAreaPolicy_t {
CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */
CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */
CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
} cufftXtWorkAreaPolicy;
// multi-GPU routines
cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs);
cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
cudaLibXtDesc ** descriptor,
cufftXtSubFormat format);
cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
void *dstPointer,
void *srcPointer,
cufftXtCopyType type);
cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);
cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);
cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
// Utility functions
cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType);
// callbacks
typedef enum cufftXtCallbackType_t {
CUFFT_CB_LD_COMPLEX = 0x0,
CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
CUFFT_CB_LD_REAL = 0x2,
CUFFT_CB_LD_REAL_DOUBLE = 0x3,
CUFFT_CB_ST_COMPLEX = 0x4,
CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
CUFFT_CB_ST_REAL = 0x6,
CUFFT_CB_ST_REAL_DOUBLE = 0x7,
CUFFT_CB_UNDEFINED = 0x8
} cufftXtCallbackType;
typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);
cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info);
cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType);
cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize);
cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
cudaDataType inputtype,
long long int *onembed,
long long int ostride,
long long int odist,
cudaDataType outputtype,
long long int batch,
size_t *workSize,
cudaDataType executiontype);
cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
cudaDataType inputtype,
long long int *onembed,
long long int ostride,
long long int odist,
cudaDataType outputtype,
long long int batch,
size_t *workSize,
cudaDataType executiontype);
cufftResult CUFFTAPI cufftXtExec(cufftHandle plan,
void *input,
void *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize);
#ifdef __cplusplus
}
#endif
#endif
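The multi-GPU entry points above are easiest to read as a sequence: attach the GPUs to a handle, plan, let the library allocate its descriptor, copy in, execute, copy out. Below is a minimal sketch of a 1D in-place C2C transform across two GPUs; it additionally assumes cufftCreate/cufftMakePlan1d from cufft.h, uses placeholder sizes and device ids, and omits error checking.

#include <cufftXt.h>
#include <cstdlib>

int main() {
    const int N = 1 << 20;            // transform length (placeholder)
    int gpus[2] = {0, 1};             // device ids (placeholder)
    size_t workSizes[2];              // one work-size entry per GPU

    cufftHandle plan;
    cufftCreate(&plan);
    cufftXtSetGPUs(plan, 2, gpus);                       // must precede planning
    cufftMakePlan1d(plan, N, CUFFT_C2C, 1, workSizes);   // single batch, C2C

    cufftComplex *host = (cufftComplex *)malloc(sizeof(cufftComplex) * N);
    // ... fill host[] with input in natural (linear) order ...

    cudaLibXtDesc *desc;                                 // library-owned, split across GPUs
    cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, desc, host, CUFFT_COPY_HOST_TO_DEVICE);

    cufftXtExecDescriptorC2C(plan, desc, desc, CUFFT_FORWARD);   // in-place execution

    cufftXtMemcpy(plan, host, desc, CUFFT_COPY_DEVICE_TO_HOST);  // result copied back to the host

    cufftXtFree(desc);
    cufftDestroy(plan);
    free(host);
    return 0;
}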

View file

@@ -1,454 +0,0 @@
/* Copyright 2005-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufftw.h
* \brief Public header file for the NVIDIA CUDA FFTW library (CUFFTW)
*/
#ifndef _CUFFTW_H_
#define _CUFFTW_H_
#include <stdio.h>
#include "cufft.h"
#ifdef __cplusplus
extern "C" {
#endif
// transform direction
#define FFTW_FORWARD -1
#define FFTW_INVERSE 1
#define FFTW_BACKWARD 1
// Planner flags
#define FFTW_ESTIMATE 0x01
#define FFTW_MEASURE 0x02
#define FFTW_PATIENT 0x03
#define FFTW_EXHAUSTIVE 0x04
#define FFTW_WISDOM_ONLY 0x05
//Algorithm restriction flags
#define FFTW_DESTROY_INPUT 0x08
#define FFTW_PRESERVE_INPUT 0x0C
#define FFTW_UNALIGNED 0x10
// CUFFTW defines and supports the following data types
// note if complex.h has been included we use the C99 complex types
#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined (complex)
typedef double _Complex fftw_complex;
typedef float _Complex fftwf_complex;
#else
typedef double fftw_complex[2];
typedef float fftwf_complex[2];
#endif
typedef void *fftw_plan;
typedef void *fftwf_plan;
typedef struct {
int n;
int is;
int os;
} fftw_iodim;
typedef fftw_iodim fftwf_iodim;
typedef struct {
ptrdiff_t n;
ptrdiff_t is;
ptrdiff_t os;
} fftw_iodim64;
typedef fftw_iodim64 fftwf_iodim64;
// CUFFTW defines and supports the following double precision APIs
fftw_plan CUFFTAPI fftw_plan_dft_1d(int n,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_2d(int n0,
int n1,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_3d(int n0,
int n1,
int n2,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft(int rank,
const int *n,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_1d(int n,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_2d(int n0,
int n1,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_3d(int n0,
int n1,
int n2,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c(int rank,
const int *n,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_1d(int n,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_2d(int n0,
int n1,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_3d(int n0,
int n1,
int n2,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r(int rank,
const int *n,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft(int rank,
const int *n,
int batch,
fftw_complex *in,
const int *inembed, int istride, int idist,
fftw_complex *out,
const int *onembed, int ostride, int odist,
int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft_r2c(int rank,
const int *n,
int batch,
double *in,
const int *inembed, int istride, int idist,
fftw_complex *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft_c2r(int rank,
const int *n,
int batch,
fftw_complex *in,
const int *inembed, int istride, int idist,
double *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
fftw_complex *in, fftw_complex *out,
int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft_r2c(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
double *in, fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft_c2r(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
fftw_complex *in, double *out,
unsigned flags);
void CUFFTAPI fftw_execute(const fftw_plan plan);
void CUFFTAPI fftw_execute_dft(const fftw_plan plan,
fftw_complex *idata,
fftw_complex *odata);
void CUFFTAPI fftw_execute_dft_r2c(const fftw_plan plan,
double *idata,
fftw_complex *odata);
void CUFFTAPI fftw_execute_dft_c2r(const fftw_plan plan,
fftw_complex *idata,
double *odata);
// CUFFTW defines and supports the following single precision APIs
fftwf_plan CUFFTAPI fftwf_plan_dft_1d(int n,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_2d(int n0,
int n1,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_3d(int n0,
int n1,
int n2,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft(int rank,
const int *n,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_1d(int n,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_2d(int n0,
int n1,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_3d(int n0,
int n1,
int n2,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c(int rank,
const int *n,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_1d(int n,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_2d(int n0,
int n1,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_3d(int n0,
int n1,
int n2,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r(int rank,
const int *n,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft(int rank,
const int *n,
int batch,
fftwf_complex *in,
const int *inembed, int istride, int idist,
fftwf_complex *out,
const int *onembed, int ostride, int odist,
int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft_r2c(int rank,
const int *n,
int batch,
float *in,
const int *inembed, int istride, int idist,
fftwf_complex *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft_c2r(int rank,
const int *n,
int batch,
fftwf_complex *in,
const int *inembed, int istride, int idist,
float *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
fftwf_complex *in, fftwf_complex *out,
int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft_r2c(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
float *in, fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft_c2r(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
fftwf_complex *in, float *out,
unsigned flags);
void CUFFTAPI fftwf_execute(const fftw_plan plan);
void CUFFTAPI fftwf_execute_dft(const fftwf_plan plan,
fftwf_complex *idata,
fftwf_complex *odata);
void CUFFTAPI fftwf_execute_dft_r2c(const fftwf_plan plan,
float *idata,
fftwf_complex *odata);
void CUFFTAPI fftwf_execute_dft_c2r(const fftwf_plan plan,
fftwf_complex *idata,
float *odata);
/// CUFFTW 64-bit Guru Interface
/// double precision
fftw_plan CUFFTAPI fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, fftw_complex* out, int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru64_dft_r2c(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, double* in, fftw_complex* out, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru64_dft_c2r(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, double* out, unsigned flags);
/// single precision
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, fftwf_complex* out, int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_r2c(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, float* in, fftwf_complex* out, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_c2r(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, float* out, unsigned flags);
#ifdef _WIN32
#define _CUFFTAPI(T) T CUFFTAPI
#else
#define _CUFFTAPI(T) CUFFTAPI T
#endif
// CUFFTW defines and supports the following support APIs
_CUFFTAPI(void *) fftw_malloc(size_t n);
_CUFFTAPI(void *) fftwf_malloc(size_t n);
void CUFFTAPI fftw_free(void *pointer);
void CUFFTAPI fftwf_free(void *pointer);
void CUFFTAPI fftw_export_wisdom_to_file(FILE * output_file);
void CUFFTAPI fftwf_export_wisdom_to_file(FILE * output_file);
void CUFFTAPI fftw_import_wisdom_from_file(FILE * input_file);
void CUFFTAPI fftwf_import_wisdom_from_file(FILE * input_file);
void CUFFTAPI fftw_print_plan(const fftw_plan plan);
void CUFFTAPI fftwf_print_plan(const fftwf_plan plan);
void CUFFTAPI fftw_set_timelimit(double seconds);
void CUFFTAPI fftwf_set_timelimit(double seconds);
double CUFFTAPI fftw_cost(const fftw_plan plan);
double CUFFTAPI fftwf_cost(const fftw_plan plan);
void CUFFTAPI fftw_flops(const fftw_plan plan, double *add, double *mul, double *fma);
void CUFFTAPI fftwf_flops(const fftw_plan plan, double *add, double *mul, double *fma);
void CUFFTAPI fftw_destroy_plan(fftw_plan plan);
void CUFFTAPI fftwf_destroy_plan(fftwf_plan plan);
void CUFFTAPI fftw_cleanup(void);
void CUFFTAPI fftwf_cleanup(void);
#ifdef __cplusplus
}
#endif
#endif /* _CUFFTW_H_ */
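The FFTW-compatible surface above is meant to be used the same way FFTW is: allocate with fftwf_malloc, plan, execute, destroy. A minimal single-precision sketch, with a placeholder length and no error handling:

#include <cufftw.h>

int main() {
    const int N = 4096;  // transform length (placeholder)

    fftwf_complex *in  = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * N);
    fftwf_complex *out = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * N);
    // ... fill in[] with input samples ...

    fftwf_plan plan = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    fftwf_execute(plan);             // out[] now holds the forward transform

    fftwf_destroy_plan(plan);
    fftwf_free(in);
    fftwf_free(out);
    return 0;
}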

File diff suppressed because it is too large

View file

@@ -1,87 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURANDDISCRETE_H_)
#define CURANDDISCRETE_H_
struct curandDistributionShift_st {
curandDistribution_t probability;
curandDistribution_t host_probability;
unsigned int shift;
unsigned int length;
unsigned int host_gen;
};
struct curandHistogramM2_st {
curandHistogramM2V_t V;
curandHistogramM2V_t host_V;
curandHistogramM2K_t K;
curandHistogramM2K_t host_K;
unsigned int host_gen;
};
struct curandDistributionM2Shift_st {
curandHistogramM2_t histogram;
curandHistogramM2_t host_histogram;
unsigned int shift;
unsigned int length;
unsigned int host_gen;
};
struct curandDiscreteDistribution_st {
curandDiscreteDistribution_t self_host_ptr;
curandDistributionM2Shift_t M2;
curandDistributionM2Shift_t host_M2;
double stddev;
double mean;
curandMethod_t method;
unsigned int host_gen;
};
#endif // !defined(CURANDDISCRETE_H_)

View file

@@ -1,253 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_DISCRETE_H_)
#define CURAND_DISCRETE_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
template <typename T>
QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return _curand_M2_double(x, discrete_distribution->M2);
}
return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
}
template <typename STATE>
QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return curand_M2_double(state, discrete_distribution->M2);
}
return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return curand_M2_double4(state, discrete_distribution->M2);
}
double4 _res;
uint4 result;
_res = curand_normal4_double(state);
result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
return result;
}
/*
* \brief Return a discrete distributed unsigned int from a XORWOW generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
*
 * Return four discrete distributed unsigned ints derived from a
* distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete4(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
*
 * Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a MTGP32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Sobol32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Sobol64 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
#endif // !defined(CURAND_DISCRETE_H_)
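To see where these overloads are called from, here is a hedged sketch in which each thread draws one value from a host-built discrete distribution using XORWOW state. It assumes the ancillary structure was created with curandCreatePoissonDistribution() from the cuRAND host API; the seed, lambda, and launch shape are placeholders.

#include <curand.h>          // host API: curandCreatePoissonDistribution
#include <curand_kernel.h>   // device API: curand_init, curand_discrete

__global__ void draw_discrete(unsigned int *out, unsigned long long seed,
                              curandDiscreteDistribution_t dist)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);        // one subsequence per thread
    out[tid] = curand_discrete(&state, dist); // uses the ancillary structure built on the host
}

// Host-side setup (placeholders for lambda and launch shape):
//   curandDiscreteDistribution_t dist;
//   curandCreatePoissonDistribution(100.0, &dist);
//   draw_discrete<<<blocks, threads>>>(d_out, 1234ULL, dist);
//   curandDestroyDistribution(dist);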

View file

@@ -1,93 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_GLOBALS_H
#define CURAND_GLOBALS_H
#define MAX_XOR_N (5)
#define SKIPAHEAD_BLOCKSIZE (4)
#define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
#define CURAND_2POW32 (4294967296.f)
#define CURAND_2POW32_DOUBLE (4294967296.)
#define CURAND_2POW32_INV (2.3283064e-10f)
#define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
#define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
#define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
#define CURAND_2PI (6.2831855f)
#define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
#define CURAND_PI_DOUBLE (3.1415926535897932)
#define CURAND_2PI_DOUBLE (6.2831853071795860)
#define CURAND_SQRT2 (-1.4142135f)
#define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
#define SOBOL64_ITR_BINARY_DIVIDE 2
#define SOBOL_M2_BINARY_DIVIDE 10
#define MTGP32_M2_BINARY_DIVIDE 32
#define MAX_LAMBDA 400000
#define MIN_GAUSS_LAMBDA 2000
struct normal_args_st {
float mean;
float stddev;
};
typedef struct normal_args_st normal_args_t;
struct normal_args_double_st {
double mean;
double stddev;
};
typedef struct normal_args_double_st normal_args_double_t;
#endif

File diff suppressed because it is too large

View file

@@ -1,697 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_LOGNORMAL_H_)
#define CURAND_LOGNORMAL_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
/**
* \brief Return a log-normally distributed float from an XORWOW generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return a log-normally distributed float from a Philox4_32_10 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return two log-normally distributed floats from an XORWOW generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
{
float2 v = curand_box_muller(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return two log-normally distributed floats from a Philox4_32_10 generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
float2 v = curand_box_muller(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return four log-normally distributed floats from a Philox4_32_10 generator.
*
* Return four log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float4 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
float4 v = curand_box_muller4(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
v.z = expf(mean + (stddev * v.z));
v.w = expf(mean + (stddev * v.w));
return v;
}
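/*
 * Usage sketch (illustrative only, not part of the declarations in this file):
 * a kernel that draws pairs of log-normal floats with the XORWOW overloads
 * above. curand_log_normal2() is the cheaper path, since one Box-Muller
 * transform yields both values. Seed, mean, and stddev below are placeholders.
 *
 *   __global__ void draw_lognormal(float *out, unsigned long long seed)
 *   {
 *       int tid = blockIdx.x * blockDim.x + threadIdx.x;
 *       curandStateXORWOW_t state;
 *       curand_init(seed, tid, 0, &state);
 *       float2 v = curand_log_normal2(&state, 0.0f, 1.0f);
 *       out[2 * tid]     = v.x;
 *       out[2 * tid + 1] = v.y;
 *   }
 */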
/**
* \brief Return a log-normally distributed float from an MRG32k3a generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
float2 v = curand_box_muller_mrg(state);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return two log-normally distributed floats from an MRG32k3a generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
{
float2 v = curand_box_muller_mrg(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
* \brief Return a log-normally distributed float from an MTGP32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a Sobol32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a Sobol64 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, then converts to log-normal
* distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, then converts to log-normal
* distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed double from an XORWOW generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x0, x1, y0, y1;
x0 = curand(state);
x1 = curand(state);
y0 = curand(state);
y1 = curand(state);
double2 v = _curand_box_muller_double(x0, x1, y0, y1);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
 * \brief Return a log-normally distributed double from a Philox4_32_10 generator.
 *
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
uint4 _x;
_x = curand4(state);
double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two log-normally distributed doubles from an XORWOW generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
{
double2 v = curand_box_muller_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return two log-normally distributed doubles from a Philox4_32_10 generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
 * \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
double2 v = curand_box_muller2_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
// not part of the API
QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
double4 v = curand_box_muller4_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
v.z = exp(mean + (stddev * v.z));
v.w = exp(mean + (stddev * v.w));
return v;
}
/**
* \brief Return a log-normally distributed double from an MRG32k3a generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
double2 v = curand_box_muller_mrg_double(state);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two log-normally distributed doubles from an MRG32k3a generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
double2 v = curand_box_muller_mrg_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
/**
* \brief Return a log-normally distributed double from an MTGP32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a Sobol32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a Sobol64 generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol64 generator in \p state,
* increment position of generator by one.
*
 * The implementation uses the inverse cumulative distribution function
 * to generate normally distributed results, and transforms them into
 * log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
 * The implementation uses the inverse cumulative distribution function
 * to generate normally distributed results, and transforms them into
 * log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
#endif // !defined(CURAND_LOGNORMAL_H_)

File diff suppressed because it is too large

View file

@@ -1,210 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_MTGP32_H
#define CURAND_MTGP32_H
/*
* @file curand_mtgp32.h
*
* @brief Mersenne Twister for Graphic Processors (mtgp32), which
* generates 32-bit unsigned integers and single precision floating
* point numbers based on IEEE 754 format.
*
* @author Mutsuo Saito (Hiroshima University)
* @author Makoto Matsumoto (Hiroshima University)
*
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define MTGPDC_MEXP 11213
#define MTGPDC_N 351
#define MTGPDC_FLOOR_2P 256
#define MTGPDC_CEIL_2P 512
#define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
#define MTGP32_STATE_SIZE 1024
#define MTGP32_STATE_MASK 1023
#define CURAND_NUM_MTGP32_PARAMS 200
#define MEXP 11213
#define THREAD_NUM MTGPDC_FLOOR_2P
#define LARGE_SIZE (THREAD_NUM * 3)
#define TBL_SIZE 16
/**
* \addtogroup DEVICE Device API
*
* @{
*/
/*
* \struct MTGP32_PARAMS_FAST_T
* MTGP32 parameters.
 * Some elements are redundant to keep the structure simple.
*
 * \b pos is a pick-up position selected to give good performance on
 * graphics processors. 3 < \b pos < Q, where Q is the maximum number
 * such that the size of the status array minus Q is a power of 2.
 * For example, when \b mexp is 44497, the size of the 32-bit status array
 * is 696, and Q is 184, so \b pos is between 4 and 183. This means
 * 512 parallel calculations are allowed when \b mexp is 44497.
*
 * \b poly_sha1 is the SHA1 digest of the characteristic polynomial of
 * the state transition function. SHA1 is calculated from the printed
 * form of the polynomial. This is important when we use parameters
 * generated by the dynamic creator.
*
 * \b mask is a mask that makes the dimension of the state space exactly
 * a Mersenne prime. This is redundant.
*/
struct mtgp32_params_fast;
struct mtgp32_params_fast {
int mexp; /*< Mersenne exponent. This is redundant. */
int pos; /*< pick up position. */
int sh1; /*< shift value 1. 0 < sh1 < 32. */
int sh2; /*< shift value 2. 0 < sh2 < 32. */
unsigned int tbl[16]; /*< a small matrix. */
unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */
unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
converting to float. */
unsigned int mask; /*< This is a mask for state space */
unsigned char poly_sha1[21]; /*< SHA1 digest */
};
/** \cond UNHIDE_TYPEDEFS */
typedef struct mtgp32_params_fast mtgp32_params_fast_t;
/** \endcond */
/*
* Generator Parameters.
*/
struct mtgp32_kernel_params;
struct mtgp32_kernel_params {
unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int mask[1];
};
/** \cond UNHIDE_TYPEDEFS */
typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
/** \endcond */
/*
* kernel I/O
* This structure must be initialized before first use.
*/
/* MTGP (Mersenne Twister) RNG */
/* This generator uses the Mersenne Twister algorithm of
* http://arxiv.org/abs/1005.4973v2
 * It has period 2^11213 - 1.
*/
/**
* CURAND MTGP32 state
*/
struct curandStateMtgp32;
struct curandStateMtgp32 {
unsigned int s[MTGP32_STATE_SIZE];
int offset;
int pIdx;
mtgp32_kernel_params_t * k;
};
/*
* CURAND MTGP32 state
*/
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateMtgp32 curandStateMtgp32_t;
/** \endcond */
/** @} */
#endif

View file

@@ -1,516 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* curand_mtgp32_host.h
*
*
* MTGP32-11213
*
* Mersenne Twister RNG for the GPU
*
* The period of generated integers is 2<sup>11213</sup>-1.
*
* This code generates 32-bit unsigned integers, and
* single precision floating point numbers uniformly distributed
* in the range [1, 2). (float r; 1.0 <= r < 2.0)
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined CURAND_MTGP32_HOST_H
#define CURAND_MTGP32_HOST_H
#if !defined(QUALIFIERS)
#define QUALIFIERS static inline __device__
#endif
#include <cuda.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include "curand.h"
#include "curand_mtgp32.h"
#include "curand_mtgp32dc_p_11213.h"
/**
* \addtogroup DEVICE Device API
*
* @{
*/
static const unsigned int non_zero = 0x4d544750;
/*
 * This is a helper function used in the initialization
 * by mtgp32_init_by_array() and mtgp32_init_by_str().
* @param[in] x 32-bit integer
* @return 32-bit integer
*/
static __forceinline__ unsigned int ini_func1(unsigned int x) {
return (x ^ (x >> 27)) * (1664525);
}
/*
 * This is a helper function used in the initialization
 * by mtgp32_init_by_array() and mtgp32_init_by_str().
* @param[in] x 32-bit integer
* @return 32-bit integer
*/
static __forceinline__ unsigned int ini_func2(unsigned int x) {
return (x ^ (x >> 27)) * (1566083941);
}
/*
* This function initializes the internal state array with a 32-bit
* integer seed. The allocated memory should be freed by calling
* mtgp32_free(). \b para should be one of the elements in the
* parameter table (mtgp32-param-ref.c).
*
 * This function is called by the CUDA program, because the CUDA program
 * uses a different structure and a different allocation method.
*
 * @param[out] state MTGP internal status vector.
* @param[in] para parameter structure
* @param[in] seed a 32-bit integer used as the seed.
*/
static __forceinline__ __host__
void mtgp32_init_state(unsigned int state[],
const mtgp32_params_fast_t *para, unsigned int seed) {
int i;
int size = para->mexp / 32 + 1;
unsigned int hidden_seed;
unsigned int tmp;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = seed;
state[1] = hidden_seed;
for (i = 1; i < size; i++) {
state[i] ^= (1812433253) * (state[i - 1] ^ (state[i - 1] >> 30)) + i;
}
}
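/*
 * Illustrative sketch, not part of the original header: host-side seeding of a
 * single raw state vector with parameter set 0 of the 11213 table included
 * above (curand_mtgp32dc_p_11213.h). The array size follows mexp/32 + 1, i.e.
 * MTGPDC_N words for MEXP 11213; the function name below is an assumption.
 */
static __forceinline__ __host__
void example_seed_one_state(unsigned int seed)
{
    unsigned int state[MTGPDC_N + 1];   /* >= 11213/32 + 1 = 351 words */
    mtgp32_init_state(state, &mtgp32dc_params_fast_11213[0], seed);
    /* state[] now holds one initialized MTGP32 status vector. */
}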
/*
* This function initializes the internal state array
* with a 32-bit integer array. \b para should be one of the elements in
* the parameter table (mtgp32-param-ref.c).
*
 * @param[out] state MTGP internal state array.
* @param[in] para parameter structure
* @param[in] array a 32-bit integer array used as a seed.
* @param[in] length length of the array.
* @return CURAND_STATUS_SUCCESS
*/
static __forceinline__ __host__
int mtgp32_init_by_array(unsigned int state[],
const mtgp32_params_fast_t *para,
unsigned int *array, int length) {
int i, j, count;
unsigned int r;
int lag;
int mid;
int size = para->mexp / 32 + 1;
unsigned int hidden_seed;
unsigned int tmp;
if (size >= 623) {
lag = 11;
} else if (size >= 68) {
lag = 7;
} else if (size >= 39) {
lag = 5;
} else {
lag = 3;
}
mid = (size - lag) / 2;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = hidden_seed;
if (length + 1 > size) {
count = length + 1;
} else {
count = size;
}
r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
state[mid] += r;
r += length;
state[(mid + lag) % size] += r;
state[0] = r;
i = 1;
count--;
for (i = 1, j = 0; (j < count) && (j < length); j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += array[j] + i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (; j < count; j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (j = 0; j < size; j++) {
r = ini_func2(state[i] + state[(i + mid) % size]
+ state[(i + size - 1) % size]);
state[(i + mid) % size] ^= r;
r -= i;
state[(i + mid + lag) % size] ^= r;
state[i] = r;
i = (i + 1) % size;
}
if (state[size - 1] == 0) {
state[size - 1] = non_zero;
}
return 0;
}
/*
* This function initializes the internal state array
* with a character array. \b para should be one of the elements in
* the parameter table (mtgp32-param-ref.c).
 * This uses the same algorithm as mtgp32_init_by_array(), but may be
 * more convenient.
*
 * @param[out] state MTGP internal state array.
* @param[in] para parameter structure
 * @param[in] array a character array used as a seed (zero-terminated).
 * @return memory allocation result; 0 means success.
*/
static __forceinline__ __host__
int mtgp32_init_by_str(unsigned int state[],
const mtgp32_params_fast_t *para, unsigned char *array) {
int i, j, count;
unsigned int r;
int lag;
int mid;
int size = para->mexp / 32 + 1;
int length = (unsigned int)strlen((char *)array);
unsigned int hidden_seed;
unsigned int tmp;
if (size >= 623) {
lag = 11;
} else if (size >= 68) {
lag = 7;
} else if (size >= 39) {
lag = 5;
} else {
lag = 3;
}
mid = (size - lag) / 2;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = hidden_seed;
if (length + 1 > size) {
count = length + 1;
} else {
count = size;
}
r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
state[mid] += r;
r += length;
state[(mid + lag) % size] += r;
state[0] = r;
i = 1;
count--;
for (i = 1, j = 0; (j < count) && (j < length); j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += array[j] + i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (; j < count; j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (j = 0; j < size; j++) {
r = ini_func2(state[i] + state[(i + mid) % size]
+ state[(i + size - 1) % size]);
state[(i + mid) % size] ^= r;
r -= i;
state[(i + mid + lag) % size] ^= r;
state[i] = r;
i = (i + 1) % size;
}
if (state[size - 1] == 0) {
state[size - 1] = non_zero;
}
return 0;
}
template<typename ParamsType>
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num)
{
const int size1 = sizeof(unsigned int) * block_num;
const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE;
unsigned int *h_pos_tbl;
unsigned int *h_sh1_tbl;
unsigned int *h_sh2_tbl;
unsigned int *h_param_tbl;
unsigned int *h_temper_tbl;
unsigned int *h_single_temper_tbl;
unsigned int *h_mask;
curandStatus_t status = CURAND_STATUS_SUCCESS;
h_pos_tbl = (unsigned int *)malloc(size1);
h_sh1_tbl = (unsigned int *)malloc(size1);
h_sh2_tbl = (unsigned int *)malloc(size1);
h_param_tbl = (unsigned int *)malloc(size2);
h_temper_tbl = (unsigned int *)malloc(size2);
h_single_temper_tbl = (unsigned int *)malloc(size2);
h_mask = (unsigned int *)malloc(sizeof(unsigned int));
if (h_pos_tbl == NULL
|| h_sh1_tbl == NULL
|| h_sh2_tbl == NULL
|| h_param_tbl == NULL
|| h_temper_tbl == NULL
|| h_single_temper_tbl == NULL
|| h_mask == NULL) {
if (h_pos_tbl != NULL) free(h_pos_tbl);
if (h_sh1_tbl != NULL) free(h_sh1_tbl);
if (h_sh2_tbl != NULL) free(h_sh2_tbl);
if (h_param_tbl != NULL) free(h_param_tbl);
if (h_temper_tbl != NULL) free(h_temper_tbl);
if (h_single_temper_tbl != NULL) free(h_single_temper_tbl);
if (h_mask != NULL) free(h_mask);
status = CURAND_STATUS_ALLOCATION_FAILED;
} else {
h_mask[0] = params[0].mask;
for (int i = 0; i < block_num; i++) {
h_pos_tbl[i] = params[i].pos;
h_sh1_tbl[i] = params[i].sh1;
h_sh2_tbl[i] = params[i].sh2;
for (int j = 0; j < TBL_SIZE; j++) {
h_param_tbl[i * TBL_SIZE + j] = params[i].tbl[j];
h_temper_tbl[i * TBL_SIZE + j] = params[i].tmp_tbl[j];
h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j];
}
}
if (cudaMemcpy( p->pos_tbl,
h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->sh1_tbl,
h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->sh2_tbl,
h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->param_tbl,
h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->temper_tbl,
h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->single_temper_tbl,
h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->mask,
h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
}
}
if (h_pos_tbl != NULL) free(h_pos_tbl);
if (h_sh1_tbl != NULL) free(h_sh1_tbl);
if (h_sh2_tbl != NULL) free(h_sh2_tbl);
if (h_param_tbl != NULL) free(h_param_tbl);
if (h_temper_tbl != NULL) free(h_temper_tbl);
if (h_single_temper_tbl != NULL)free(h_single_temper_tbl);
if (h_mask != NULL) free(h_mask);
return status;
}
/**
* \brief Set up constant parameters for the mtgp32 generator
*
* This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of
* generator parameters for use by kernel functions and copies the
* result to the specified location in device memory.
*
* \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
* \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory.
*
* \return
* - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
* - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
* - CURAND_STATUS_SUCCESS otherwise
*/
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p)
{
return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS);
}
/**
* \brief Set up initial states for the mtgp32 generator
*
* This host-side helper function initializes a number of states (one parameter set per state) for
* an mtgp32 generator. To accomplish this it allocates a state array in host memory,
* initializes that array, and copies the result to device memory.
*
* \param s - pointer to an array of states in device memory
* \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
* \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory
* \param n - number of parameter sets/states to initialize
* \param seed - seed value
*
* \return
* - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated
* - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
* - CURAND_STATUS_SUCCESS otherwise
*/
static __forceinline__ __host__
curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s,
mtgp32_params_fast_t params[],
mtgp32_kernel_params_t *k,
int n,
unsigned long long seed)
{
int i;
curandStatus_t status = CURAND_STATUS_SUCCESS;
curandStateMtgp32_t *h_status =(curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n);
if (h_status == NULL) {
status = CURAND_STATUS_ALLOCATION_FAILED;
} else {
seed = seed ^ (seed >> 32);
for (i = 0; i < n; i++) {
mtgp32_init_state(&(h_status[i].s[0]), &params[i],(unsigned int)seed + i + 1);
h_status[i].offset = 0;
h_status[i].pIdx = i;
h_status[i].k = k;
}
if (cudaMemcpy(s, h_status,
sizeof(curandStateMtgp32_t) * n,
cudaMemcpyHostToDevice) != cudaSuccess) {
status = CURAND_STATUS_INITIALIZATION_FAILED;
}
}
free(h_status);
return status;
}
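/*
 * Illustrative sketch, not part of the original header: a typical host-side
 * setup sequence for the two helpers above, assuming 64 generator states
 * (one per thread block) and the 11213 parameter table included above.
 * The function name and the choice of 64 are assumptions; error handling is
 * reduced to early returns, and the CUDA runtime API is assumed available
 * because the file is compiled with nvcc.
 */
static __forceinline__ __host__
curandStatus_t example_setup_mtgp32(curandStateMtgp32_t **states,
                                    mtgp32_kernel_params_t **kparams,
                                    unsigned long long seed)
{
    const int n = 64;   /* number of states; must not exceed CURAND_NUM_MTGP32_PARAMS */
    if (cudaMalloc((void **)kparams, sizeof(mtgp32_kernel_params_t)) != cudaSuccess)
        return CURAND_STATUS_ALLOCATION_FAILED;
    if (cudaMalloc((void **)states, n * sizeof(curandStateMtgp32_t)) != cudaSuccess)
        return CURAND_STATUS_ALLOCATION_FAILED;
    curandStatus_t st = curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, *kparams);
    if (st != CURAND_STATUS_SUCCESS)
        return st;
    return curandMakeMTGP32KernelState(*states, mtgp32dc_params_fast_11213,
                                       *kparams, n, seed);
}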
/** @} */
#endif

View file

@@ -1,385 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* curand_mtgp32_kernel.h
*
*
* MTGP32-11213
*
* Mersenne Twister RNG for the GPU
*
* The period of generated integers is 2<sup>11213</sup>-1.
*
* This code generates 32-bit unsigned integers, and
* single precision floating point numbers uniformly distributed
* in the range [1, 2). (float r; 1.0 <= r < 2.0)
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined CURAND_MTGP32_KERNEL_H
#define CURAND_MTGP32_KERNEL_H
#if !defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#endif
#ifndef __CUDACC_RTC__
#include <cuda.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#endif // ifndef __CUDACC_RTC__
#include "curand.h"
#include "curand_mtgp32.h"
/**
* \addtogroup DEVICE Device API
*
* @{
*/
#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host compatibility call
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif
/*
 * Computes the recursion formula of the generator.
*
* @param[in] X1 the farthest part of state array.
* @param[in] X2 the second farthest part of state array.
* @param[in] Y a part of state array.
* @param[in] bid block id.
* @return output
*/
QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
unsigned int X = (X1 & k->mask[0]) ^ X2;
unsigned int MAT;
X ^= X << k->sh1_tbl[bid];
Y = X ^ (Y >> k->sh2_tbl[bid]);
MAT = k->param_tbl[bid][Y & 0x0f];
return Y ^ MAT;
}
/*
* The tempering function.
*
 * @param[in] V the output value to be tempered.
* @param[in] T the tempering helper value.
* @param[in] bid block id.
* @return the tempered value.
*/
QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
unsigned int MAT;
T ^= T >> 16;
T ^= T >> 8;
MAT = k->temper_tbl[bid][T & 0x0f];
return V ^ MAT;
}
/*
* The tempering and converting function.
* By using the preset table, converting to IEEE format
* and tempering are done simultaneously.
*
 * @param[in] V the output value to be tempered.
* @param[in] T the tempering helper value.
* @param[in] bid block id.
* @return the tempered and converted value.
*/
QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
unsigned int MAT;
unsigned int r;
T ^= T >> 16;
T ^= T >> 8;
MAT = k->single_temper_tbl[bid][T & 0x0f];
r = (V >> 9) ^ MAT;
return r;
}
/**
* \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
*
* Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
* increment position of generator by the number of threads in the block.
 * Note that the number of threads in the block cannot exceed 256.
*
* \param state - Pointer to state to update
*
* \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
*/
QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
{
unsigned int t;
unsigned int d;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o;
d = blockDim.z * blockDim.y * blockDim.x;
//assert( d <= 256 );
t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
o = temper(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (t == 0)
{
state->offset = (state->offset + d) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
return o;
}
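/*
 * Illustrative sketch, not part of the original header: every thread of a
 * 256-thread block draws from the block's own MTGP32 state, matching the
 * per-block limit described above. The kernel name and output layout are
 * assumptions; the states would come from curandMakeMTGP32KernelState().
 */
__global__ void example_mtgp32_bits(curandStateMtgp32_t *states,
                                    unsigned int *out, int samples_per_block)
{
    /* All threads of the block call curand() on every iteration, so the
       __syncthreads() inside the generator stays uniform across the block. */
    for (int i = 0; i < samples_per_block; i += blockDim.x) {
        unsigned int r = curand(&states[blockIdx.x]);
        if (i + (int)threadIdx.x < samples_per_block)
            out[blockIdx.x * samples_per_block + i + threadIdx.x] = r;
    }
}
/* Launched as, e.g., example_mtgp32_bits<<<64, 256>>>(devStates, devOut, 4096); */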
/**
* \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
*
* Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
* increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block for this invocation.
*
 * Note:
 * Thread indices must range from 0...\p n - 1.
* The number of positions updated may not exceed 256.
* A thread block may update more than one state, but a given state may not be updated by more than one thread block.
*
* \param state - Pointer to state to update
* \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
*
* \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
*/
QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
unsigned int t;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o;
t = index;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
o = temper(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (index == 0)
{
state->offset = (state->offset + n) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
return o;
}
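/*
 * Illustrative sketch, not part of the original header: a block of exactly 64
 * threads draws from its state through the position-specific entry point
 * above, passing the thread index and the total count of updated positions.
 * The kernel name and the choice of 64 are assumptions for this example.
 */
__global__ void example_mtgp32_specific(curandStateMtgp32_t *states, unsigned int *out)
{
    const unsigned char n = 64;   /* positions updated per call; launch with 64 threads per block */
    unsigned int r = curand_mtgp32_specific(&states[blockIdx.x],
                                            (unsigned char)threadIdx.x, n);
    out[blockIdx.x * n + threadIdx.x] = r;
}
/* Launched as, e.g., example_mtgp32_specific<<<gridDim, 64>>>(devStates, devOut); */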
/**
* \brief Return a uniformly distributed float from a mtgp32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the mtgp32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* Note: This alternate derivation of a uniform float is provided for completeness
 * with the original source.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
{
unsigned int t;
unsigned int d;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o_u;
float o_f;
t = blockDim.z * blockDim.y;
d = t * blockDim.x;
//assert( d <= 256 );
t += threadIdx.x;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[t] = r;
o_u = temper_single(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (threadIdx.x == 0)
{
state->offset = (state->offset + d) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
memcpy(&o_f, &o_u, sizeof(o_u));
return o_f;
}
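/*
 * Illustrative sketch, not part of the original header: the generic IEEE-754
 * trick behind the "temper and convert" path used above. Placing 23 random
 * bits in the mantissa of a float whose exponent field encodes 2^0 yields a
 * value uniform in [1.0f, 2.0f); subtracting 1.0f maps it to [0.0f, 1.0f).
 * This is a simplified stand-in for the table-driven conversion, not the
 * cuRAND implementation itself; the function name is an assumption.
 */
QUALIFIERS float example_bits_to_unit_float(unsigned int r)
{
    unsigned int bits = 0x3f800000u | (r >> 9);   /* sign 0, exponent 127, random mantissa */
    float f;
    memcpy(&f, &bits, sizeof(f));                 /* f is uniform in [1.0f, 2.0f) */
    return f - 1.0f;                              /* shift to [0.0f, 1.0f) */
}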
/**
* \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from position \p index of the mtgp32 generator in \p state, and
* increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block for this invocation.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* Note 1:
* Thread indices must range from 0...\p n - 1.
* The number of positions updated may not exceed 256.
* A thread block may update more than one state, but a given state may not be updated by more than one thread block.
*
* Note 2: This alternate derivation of a uniform float is provided for completeness
 * with the original source.
*
* \param state - Pointer to state to update
* \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
unsigned int t;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o_u;
float o_f;
t = index;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[t] = r;
o_u = temper_single(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (threadIdx.x == 0)
{
state->offset = (state->offset + n) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
memcpy(&o_f, &o_u, sizeof(o_u));
return o_f;
}
/** @} */
#endif

File diff suppressed because it is too large

View file

@@ -1,837 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_NORMAL_H_)
#define CURAND_NORMAL_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
#include "curand_normal_static.h"
QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
{
float2 result;
float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2);
float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2);
#if __CUDA_ARCH__ > 0
float s = sqrtf(-2.0f * logf(u));
__sincosf(v, &result.x, &result.y);
#else
float s = sqrtf(-2.0f * logf(u));
result.x = sinf(v);
result.y = cosf(v);
#endif
result.x *= s;
result.y *= s;
return result;
}
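/*
 * Illustrative sketch, not part of the original header: a plain host-side
 * Box-Muller step with the same structure as _curand_box_muller() above.
 * Two 32-bit draws are mapped into (0, 1) and (0, 2*pi), then combined into
 * two independent standard normals. The constants are written out explicitly
 * instead of using the cuRAND macros; the function name is an assumption.
 */
static inline void example_box_muller_host(unsigned int x, unsigned int y,
                                           float *n0, float *n1)
{
    const float inv32  = 2.3283064365386963e-10f;        /* 2^-32 */
    const float two_pi = 6.283185307179586f;
    float u = x * inv32 + inv32 * 0.5f;                  /* strictly inside (0, 1) */
    float v = y * (inv32 * two_pi) + (inv32 * two_pi) * 0.5f;
    float s = sqrtf(-2.0f * logf(u));
    *n0 = s * sinf(v);
    *n1 = s * cosf(v);
}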
QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
{
float x, y;
x = curand_uniform(state);
y = curand_uniform(state) * CURAND_2PI;
float2 result;
#if __CUDA_ARCH__ > 0
float s = sqrtf(-2.0f * logf(x));
__sincosf(y, &result.x, &result.y);
#else
float s = sqrtf(-2.0f * logf(x));
result.x = sinf(y);
result.y = cosf(y);
#endif
result.x *= s;
result.y *= s;
return result;
}
QUALIFIERS double2
_curand_box_muller_double(unsigned int x0, unsigned int x1,
unsigned int y0, unsigned int y1)
{
double2 result;
unsigned long long zx = (unsigned long long)x0 ^
((unsigned long long)x1 << (53 - 32));
double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
unsigned long long zy = (unsigned long long)y0 ^
((unsigned long long)y1 << (53 - 32));
double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE;
double s = sqrt(-2.0 * log(u));
#if __CUDA_ARCH__ > 0
sincospi(v, &result.x, &result.y);
#else
result.x = sin(v*CURAND_PI_DOUBLE);
result.y = cos(v*CURAND_PI_DOUBLE);
#endif
result.x *= s;
result.y *= s;
return result;
}
QUALIFIERS double2
curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
{
double x, y;
double2 result;
x = curand_uniform_double(state);
y = curand_uniform_double(state) * 2.0;
double s = sqrt(-2.0 * log(x));
#if __CUDA_ARCH__ > 0
sincospi(y, &result.x, &result.y);
#else
result.x = sin(y*CURAND_PI_DOUBLE);
result.y = cos(y*CURAND_PI_DOUBLE);
#endif
result.x *= s;
result.y *= s;
return result;
}
template <typename R>
QUALIFIERS float2 curand_box_muller(R *state)
{
float2 result;
unsigned int x = curand(state);
unsigned int y = curand(state);
result = _curand_box_muller(x, y);
return result;
}
template <typename R>
QUALIFIERS float4 curand_box_muller4(R *state)
{
float4 result;
float2 _result;
uint4 x = curand4(state);
//unsigned int y = curand(state);
_result = _curand_box_muller(x.x, x.y);
result.x = _result.x;
result.y = _result.y;
_result = _curand_box_muller(x.z, x.w);
result.z = _result.x;
result.w = _result.y;
return result;
}
template <typename R>
QUALIFIERS double2 curand_box_muller_double(R *state)
{
double2 result;
unsigned int x0 = curand(state);
unsigned int x1 = curand(state);
unsigned int y0 = curand(state);
unsigned int y1 = curand(state);
result = _curand_box_muller_double(x0, x1, y0, y1);
return result;
}
template <typename R>
QUALIFIERS double2 curand_box_muller2_double(R *state)
{
double2 result;
uint4 _x;
_x = curand4(state);
result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
return result;
}
template <typename R>
QUALIFIERS double4 curand_box_muller4_double(R *state)
{
double4 result;
double2 _res1;
double2 _res2;
uint4 _x;
uint4 _y;
_x = curand4(state);
_y = curand4(state);
_res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
_res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
result.x = _res1.x;
result.y = _res1.y;
result.z = _res2.x;
result.w = _res2.y;
return result;
}
//QUALIFIERS float _curand_normal_icdf(unsigned int x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// float s = CURAND_SQRT2;
// // Mirror to avoid loss of precision
// if(x > 0x80000000UL) {
// x = 0xffffffffUL - x;
// s = -s;
// }
// float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinvf(2.0f * p);
//#else
// x++; //suppress warnings
// return 0.0f;
//#endif
//}
//
//QUALIFIERS float _curand_normal_icdf(unsigned long long x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// unsigned int t = (unsigned int)(x >> 32);
// float s = CURAND_SQRT2;
// // Mirror to avoid loss of precision
// if(t > 0x80000000UL) {
// t = 0xffffffffUL - t;
// s = -s;
// }
// float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinvf(2.0f * p);
//#else
// x++;
// return 0.0f;
//#endif
//}
//
//QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// double s = CURAND_SQRT2_DOUBLE;
// // Mirror to avoid loss of precision
// if(x > 0x80000000UL) {
// x = 0xffffffffUL - x;
// s = -s;
// }
// double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinv(2.0 * p);
//#else
// x++;
// return 0.0;
//#endif
//}
//
//QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// double s = CURAND_SQRT2_DOUBLE;
// x >>= 11;
// // Mirror to avoid loss of precision
// if(x > 0x10000000000000UL) {
// x = 0x1fffffffffffffUL - x;
// s = -s;
// }
// double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinv(2.0 * p);
//#else
// x++;
// return 0.0;
//#endif
//}
//
/**
* \brief Return a normally distributed float from an XORWOW generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the XORWOW generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return a normally distributed float from a Philox4_32_10 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
* \brief Return a normally distributed float from an MRG32k3a generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
float2 v = curand_box_muller_mrg(state);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
* \brief Return two normally distributed floats from an XORWOW generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
{
return curand_box_muller(state);
}
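/*
 * Illustrative sketch, not part of the original header: when a thread needs
 * normals in pairs, one curand_normal2() call keeps both Box-Muller outputs
 * instead of caching the second one in the state as the single-value variant
 * does. The kernel name and output layout are assumptions, and the snippet is
 * meant for a .cu file that includes <curand_kernel.h>.
 */
__global__ void example_normal_pairs(float2 *out, unsigned long long seed, int n)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n)
        return;
    curandStateXORWOW_t state;
    curand_init(seed, id, 0, &state);   /* one sequence per thread */
    out[id] = curand_normal2(&state);   /* two N(0,1) samples from one call */
}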
/**
 * \brief Return two normally distributed floats from a Philox4_32_10 generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
{
return curand_box_muller(state);
}
/**
 * \brief Return four normally distributed floats from a Philox4_32_10 generator.
*
* Return four normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
 * The implementation uses Box-Muller transforms to generate four
 * normally distributed results.
*
* \param state - Pointer to state to update
*
 * \return Normally distributed float4 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
{
return curand_box_muller4(state);
}
/**
* \brief Return two normally distributed floats from an MRG32k3a generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
{
return curand_box_muller_mrg(state);
}
/**
* \brief Return a normally distributed float from a MTGP32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a Sobol32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateSobol32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a scrambled Sobol32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a Sobol64 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateSobol64_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a scrambled Sobol64 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed double from an XORWOW generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the XORWOW generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
unsigned int x0, x1, y0, y1;
x0 = curand(state);
x1 = curand(state);
y0 = curand(state);
y1 = curand(state);
double2 v = _curand_box_muller_double(x0, x1, y0, y1);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
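/* Editor's illustrative sketch, not part of the original header: one way a
 * kernel might consume curand_normal_double() with per-thread XORWOW states.
 * The kernel name and launch layout are assumptions made for this example;
 * only curand_init() and curand_normal_double() are cuRAND API calls, and a
 * standalone .cu file would #include <curand_kernel.h>. Because the second
 * Box-Muller output is cached in the state, consecutive calls alternate
 * between computing a fresh pair and returning the cached companion. */
__global__ void example_fill_normal_double(double *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);              /* one state per thread */
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_normal_double(&state);
}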
/**
 * \brief Return a normally distributed double from a Philox4_32_10 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
uint4 _x;
_x = curand4(state);
double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return a normally distributed double from an MRG32k3a generator.
*
* Return a single normally distributed double with mean \p 0.0 and
 * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
double2 v = curand_box_muller_mrg_double(state);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two normally distributed doubles from an XORWOW generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the XORWOW generator in \p state,
* increment position of generator by 2.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
{
return curand_box_muller_double(state);
}
/**
 * \brief Return two normally distributed doubles from a Philox4_32_10 generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
* increment position of generator by 2.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
double2 result;
_x = curand4(state);
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
result.x = v1.x;
result.y = v1.y;
return result;
}
// not a part of API
QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
uint4 _y;
double4 result;
_x = curand4(state);
_y = curand4(state);
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
result.x = v1.x;
result.y = v1.y;
result.z = v2.x;
result.w = v2.y;
return result;
}
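/* Editor's illustrative sketch, not part of the original header: drawing
 * normally distributed doubles two at a time from a Philox4_32_10 state via
 * the documented curand_normal2_double() (the four-wide double variant above
 * is explicitly not part of the public API). The kernel name and indexing are
 * assumptions; a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_normal2_double(double *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = 2 * tid; i + 1 < n; i += 2 * gridDim.x * blockDim.x) {
        double2 v = curand_normal2_double(&state);  /* one call -> two doubles */
        out[i]     = v.x;
        out[i + 1] = v.y;
    }
}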
/**
* \brief Return two normally distributed doubles from an MRG32k3a generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
{
return curand_box_muller_mrg_double(state);
}
/**
* \brief Return a normally distributed double from an MTGP32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
 * \brief Return a normally distributed double from a Sobol32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a scrambled Sobol32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a Sobol64 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a scrambled Sobol64 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
#endif // !defined(CURAND_NORMAL_H_)

View file

@@ -1,127 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_NORMAL_STATIC_H
#define CURAND_NORMAL_STATIC_H
#define QUALIFIERS_STATIC __host__ __device__ __forceinline__
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
float s = CURAND_SQRT2;
// Mirror to avoid loss of precision
if(x > 0x80000000UL) {
x = 0xffffffffUL - x;
s = -s;
}
float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinvf(2.0f * p);
#else
x++; //suppress warnings
return 0.0f;
#endif
}
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
unsigned int t = (unsigned int)(x >> 32);
float s = CURAND_SQRT2;
// Mirror to avoid loss of precision
if(t > 0x80000000UL) {
t = 0xffffffffUL - t;
s = -s;
}
float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinvf(2.0f * p);
#else
x++;
return 0.0f;
#endif
}
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
double s = CURAND_SQRT2_DOUBLE;
// Mirror to avoid loss of precision
if(x > 0x80000000UL) {
x = 0xffffffffUL - x;
s = -s;
}
double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinv(2.0 * p);
#else
x++;
return 0.0;
#endif
}
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
double s = CURAND_SQRT2_DOUBLE;
x >>= 11;
// Mirror to avoid loss of precision
if(x > 0x10000000000000UL) {
x = 0x1fffffffffffffUL - x;
s = -s;
}
double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinv(2.0 * p);
#else
x++;
return 0.0;
#endif
}
#undef QUALIFIERS_STATIC
#endif
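/* Editor's illustrative restatement, not part of the original header: the same
 * inverse-CDF mapping written out for the 32-bit case. The function name and
 * the literal constants (sqrt(2), 1/2^32 and its half) are assumptions standing
 * in for CURAND_SQRT2 and CURAND_2POW32_INV; erfcinvf() is the CUDA device math
 * function used above. */
__device__ float example_normal_from_u32(unsigned int x)
{
    float s = 1.41421356237309505f;        /* sqrt(2) */
    if (x > 0x80000000U) {                 /* mirror the upper half of the range */
        x = 0xffffffffU - x;
        s = -s;
    }
    /* scale into p in (0, 0.5]; the half-step keeps p strictly positive */
    float p = x * 2.3283064365386963e-10f + 1.1641532182693481e-10f;
    /* erfcinvf(2p) gives a positive half-normal deviate; s restores the sign */
    return s * erfcinvf(2.0f * p);
}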

View file

@@ -1,194 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CURAND_PHILOX4X32_X__H_
#define CURAND_PHILOX4X32_X__H_
#if !defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
struct curandStatePhilox4_32_10 {
uint4 ctr;
uint4 output;
uint2 key;
unsigned int STATE;
int boxmuller_flag;
int boxmuller_flag_double;
float boxmuller_extra;
double boxmuller_extra_double;
};
typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
{
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n>>32);
s->ctr.x += nlo;
if( s->ctr.x < nlo )
nhi++;
s->ctr.y += nhi;
if(nhi <= s->ctr.y)
return;
if(++s->ctr.z) return;
++s->ctr.w;
}
QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
{
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n>>32);
s->ctr.z += nlo;
if( s->ctr.z < nlo )
nhi++;
s->ctr.w += nhi;
}
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
{
if(++s->ctr.x) return;
if(++s->ctr.y) return;
if(++s->ctr.z) return;
++s->ctr.w;
}
QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
{
#ifndef __CUDA_ARCH__
// host code
unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
*hip = product >> 32;
return (unsigned int)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
{
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
return ret;
}
QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
{
c = _philox4x32round(c, k); // 1
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 2
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 3
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 4
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 5
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 6
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 7
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 8
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 9
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
return _philox4x32round(c, k); // 10
}
#endif
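/* Editor's illustrative sketch, not part of the original header: the raw
 * generator above is normally reached through the public device API rather
 * than by calling curand_Philox4x32_10() directly. The kernel name and
 * indexing are assumptions; curand_init() and curand4() are the cuRAND entry
 * points, and a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_philox_bits(unsigned int *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    /* the seed feeds the key; subsequence/offset position the 128-bit counter */
    curand_init(seed, tid, 0, &state);
    for (int i = 4 * tid; i + 3 < n; i += 4 * gridDim.x * blockDim.x) {
        uint4 r = curand4(&state);          /* four 32-bit outputs per call */
        out[i]     = r.x;
        out[i + 1] = r.y;
        out[i + 2] = r.z;
        out[i + 3] = r.w;
    }
}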

View file

@@ -1,751 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_POISSON_H_)
#define CURAND_POISSON_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
#define CR_CUDART_PI 3.1415926535897931e+0
#define CR_CUDART_TWO_TO_52 4503599627370496.0
QUALIFIERS float __cr_rsqrt(float a)
{
#ifdef __CUDA_ARCH__
asm ("rsqrt.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = 1.0f / sqrtf (a);
#endif
return a;
}
QUALIFIERS float __cr_exp (float a)
{
#ifdef __CUDA_ARCH__
a = a * 1.4426950408889634074;
asm ("ex2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = expf (a);
#endif
return a;
}
QUALIFIERS float __cr_log (float a)
{
#ifdef __CUDA_ARCH__
asm ("lg2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
a = a * 0.69314718055994530942;
#else
a = logf (a);
#endif
return a;
}
QUALIFIERS float __cr_rcp (float a)
{
#ifdef __CUDA_ARCH__
asm ("rcp.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = 1.0f / a;
#endif
return a;
}
/* Computes regularized gamma function: gammainc(a,x)/gamma(a) */
QUALIFIERS float __cr_pgammainc (float a, float x)
{
float t, alpha, beta;
/* First level parametrization constants */
float ma1 = 1.43248035075540910f,
ma2 = 0.12400979329415655f,
ma3 = 0.00025361074907033f,
mb1 = 0.21096734870196546f,
mb2 = 1.97381164089999420f,
mb3 = 0.94201734077887530f;
/* Second level parametrization constants (depends only on a) */
alpha = __cr_rsqrt (a - ma2);
alpha = ma1 * alpha + ma3;
beta = __cr_rsqrt (a - mb2);
beta = mb1 * beta + mb3;
/* Final approximation (depends on a and x) */
t = a - x;
t = alpha * t - beta;
t = 1.0f + __cr_exp (t);
t = t * t;
t = __cr_rcp (t);
/* Negative a,x or a,x=NAN requires special handling */
//t = !(x > 0 && a >= 0) ? 0.0 : t;
return t;
}
/* Computes inverse of pgammainc */
QUALIFIERS float __cr_pgammaincinv (float a, float y)
{
float t, alpha, beta;
/* First level parametrization constants */
float ma1 = 1.43248035075540910f,
ma2 = 0.12400979329415655f,
ma3 = 0.00025361074907033f,
mb1 = 0.21096734870196546f,
mb2 = 1.97381164089999420f,
mb3 = 0.94201734077887530f;
/* Second level parametrization constants (depends only on a) */
alpha = __cr_rsqrt (a - ma2);
alpha = ma1 * alpha + ma3;
beta = __cr_rsqrt (a - mb2);
beta = mb1 * beta + mb3;
/* Final approximation (depends on a and y) */
t = __cr_rsqrt (y) - 1.0f;
t = __cr_log (t);
t = beta + t;
t = - t * __cr_rcp (alpha) + a;
/* Negative a,x or a,x=NAN requires special handling */
//t = !(y > 0 && a >= 0) ? 0.0 : t;
return t;
}
#if defined(__CUDACC_RDC__) && (__cplusplus >= 201703L) && defined(__cpp_inline_variables)
inline __constant__ double __cr_lgamma_table [] = {
#else
static __constant__ double __cr_lgamma_table [] = {
#endif
0.000000000000000000e-1,
0.000000000000000000e-1,
6.931471805599453094e-1,
1.791759469228055001e0,
3.178053830347945620e0,
4.787491742782045994e0,
6.579251212010100995e0,
8.525161361065414300e0,
1.060460290274525023e1
};
QUALIFIERS double __cr_lgamma_integer(int a)
{
double s;
double t;
double fa = fabs((float)a);
double sum;
if (a > 8) {
/* Stirling approximation; coefficients from Hart et al, "Computer
* Approximations", Wiley 1968. Approximation 5404.
*/
s = 1.0 / fa;
t = s * s;
sum = -0.1633436431e-2;
sum = sum * t + 0.83645878922e-3;
sum = sum * t - 0.5951896861197e-3;
sum = sum * t + 0.793650576493454e-3;
sum = sum * t - 0.277777777735865004e-2;
sum = sum * t + 0.833333333333331018375e-1;
sum = sum * s + 0.918938533204672;
s = 0.5 * log (fa);
t = fa - 0.5;
s = s * t;
t = s - fa;
s = s + sum;
t = t + s;
return t;
} else {
#ifdef __CUDA_ARCH__
return __cr_lgamma_table [(int) fa-1];
#else
switch(a) {
case 1: return 0.000000000000000000e-1;
case 2: return 0.000000000000000000e-1;
case 3: return 6.931471805599453094e-1;
case 4: return 1.791759469228055001e0;
case 5: return 3.178053830347945620e0;
case 6: return 4.787491742782045994e0;
case 7: return 6.579251212010100995e0;
case 8: return 8.525161361065414300e0;
default: return 1.060460290274525023e1;
}
#endif
}
}
#define KNUTH_FLOAT_CONST 60.0
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
QUALIFIERS unsigned int curand_poisson_knuth(T *state, float lambda)
{
unsigned int k = 0;
float p = expf(lambda);
do{
k++;
p *= curand_uniform(state);
}while (p > 1.0);
return k-1;
}
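/* Editor's illustrative host-side restatement, not part of the original header:
 * the same Knuth loop using the C standard library in place of curand_uniform().
 * The function name is an assumption; like the device version, it is only
 * practical for small lambda because p starts at exp(lambda). */
static unsigned int example_poisson_knuth_host(double lambda)
{
    unsigned int k = 0;
    double p = exp(lambda);                 /* needs <math.h> and <stdlib.h> */
    do {
        k++;
        /* multiply uniforms in (0,1) into p until the product drops to 1 */
        p *= (rand() + 1.0) / ((double)RAND_MAX + 2.0);
    } while (p > 1.0);
    return k - 1;
}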
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
QUALIFIERS uint4 curand_poisson_knuth4(T *state, float lambda)
{
uint4 k = {0,0,0,0};
float exp_lambda = expf(lambda);
float4 p={ exp_lambda,exp_lambda,exp_lambda,exp_lambda };
do{
k.x++;
p.x *= curand_uniform(state);
}while (p.x > 1.0);
do{
k.y++;
p.y *= curand_uniform(state);
}while (p.y > 1.0);
do{
k.z++;
p.z *= curand_uniform(state);
}while (p.z > 1.0);
do{
k.w++;
p.w *= curand_uniform(state);
}while (p.w > 1.0);
k.x--;
k.y--;
k.z--;
k.w--;
return k;
}
template <typename T>
// Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
QUALIFIERS unsigned int _curand_M2_double(T x, curandDistributionM2Shift_t distributionM2)
{
double u = _curand_uniform_double(x);
int j = (int) floor(distributionM2->length*u);
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double histogramVj = __ldg( &(distributionM2->histogram->V[j]));
unsigned int histogramKj = __ldg( &(distributionM2->histogram->K[j]));
#else
double histogramVj = distributionM2->histogram->V[j];
unsigned int histogramKj = distributionM2->histogram->K[j];
#endif
//if (u < distributionM2->histogram->V[j]) return distributionM2->shift + j;
//return distributionM2->shift + distributionM2->histogram->K[j];
if (u < histogramVj) return distributionM2->shift + j;
return distributionM2->shift + histogramKj;
}
template <typename T>
// Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
QUALIFIERS uint4 _curand_M2_double4(T x, curandDistributionM2Shift_t distributionM2)
{
double4 u;
uint4 result = {0,0,0,0};
int4 flag = {1,1,1,1};
u.x = _curand_uniform_double(x.x);
u.y = _curand_uniform_double(x.y);
u.z = _curand_uniform_double(x.z);
u.w = _curand_uniform_double(x.w);
int4 j;
j.x = (int) floor(distributionM2->length*u.x);
j.y = (int) floor(distributionM2->length*u.y);
j.z = (int) floor(distributionM2->length*u.z);
j.w = (int) floor(distributionM2->length*u.w);
// int result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double histogramVjx = __ldg( &(distributionM2->histogram->V[j.x]));
double histogramVjy = __ldg( &(distributionM2->histogram->V[j.y]));
double histogramVjz = __ldg( &(distributionM2->histogram->V[j.z]));
double histogramVjw = __ldg( &(distributionM2->histogram->V[j.w]));
unsigned int histogramKjx = __ldg( &(distributionM2->histogram->K[j.x]));
unsigned int histogramKjy = __ldg( &(distributionM2->histogram->K[j.y]));
unsigned int histogramKjz = __ldg( &(distributionM2->histogram->K[j.z]));
unsigned int histogramKjw = __ldg( &(distributionM2->histogram->K[j.w]));
#else
double histogramVjx = distributionM2->histogram->V[j.x];
double histogramVjy = distributionM2->histogram->V[j.y];
double histogramVjz = distributionM2->histogram->V[j.z];
double histogramVjw = distributionM2->histogram->V[j.w];
unsigned int histogramKjx = distributionM2->histogram->K[j.x];
unsigned int histogramKjy = distributionM2->histogram->K[j.y];
unsigned int histogramKjz = distributionM2->histogram->K[j.z];
unsigned int histogramKjw = distributionM2->histogram->K[j.w];
#endif
if (u.x < histogramVjx){ result.x = distributionM2->shift + j.x; flag.x = 0; }
if (u.y < histogramVjy){ result.y = distributionM2->shift + j.y; flag.y = 0; }
if (u.z < histogramVjz){ result.z = distributionM2->shift + j.z; flag.z = 0; }
if (u.w < histogramVjw){ result.w = distributionM2->shift + j.w; flag.w = 0; }
//return distributionM2->shift + distributionM2->histogram->K[j];
if(flag.x) result.x = distributionM2->shift + histogramKjx;
if(flag.y) result.y = distributionM2->shift + histogramKjy;
if(flag.z) result.z = distributionM2->shift + histogramKjz;
if(flag.w) result.w = distributionM2->shift + histogramKjw;
return result;
}
template <typename STATE>
QUALIFIERS unsigned int curand_M2_double(STATE *state, curandDistributionM2Shift_t distributionM2)
{
return _curand_M2_double(curand(state), distributionM2);
}
template <typename STATE>
QUALIFIERS uint4 curand_M2_double4(STATE *state, curandDistributionM2Shift_t distributionM2)
{
return _curand_M2_double4(curand4(state), distributionM2);
}
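/* Editor's illustrative sketch, not part of the original header: the square
 * histogram lookup above, restated with a hypothetical standalone struct in
 * place of curandDistributionM2Shift_t. One uniform draw in [0,1) picks a
 * cell; the cell either keeps its own value or jumps to its alias, so the
 * expected cost per sample is O(1). */
struct example_square_histogram {
    int length;                 /* number of cells */
    unsigned int shift;         /* value represented by cell 0 */
    const double *V;            /* per-cell thresholds */
    const unsigned int *K;      /* per-cell alias targets */
};

static unsigned int example_sample_square_histogram(const struct example_square_histogram *h,
                                                    double u /* in [0,1) */)
{
    int j = (int)(h->length * u);           /* uniform cell pick */
    return (u < h->V[j]) ? h->shift + (unsigned int)j : h->shift + h->K[j];
}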
template <typename T>
QUALIFIERS unsigned int _curand_binary_search_double(T x, curandDistributionShift_t distribution)
{
double u = _curand_uniform_double(x);
int min = 0;
int max = distribution->length-1;
do{
int mid = (max + min)/2;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double probability_mid = __ldg( &(distribution->probability[mid]));
#else
double probability_mid = distribution->probability[mid];
#endif
if (u <= probability_mid){
max = mid;
}else{
min = mid+1;
}
}while (min < max);
return distribution->shift + min;
}
template <typename STATE>
QUALIFIERS unsigned int curand_binary_search_double(STATE *state, curandDistributionShift_t distribution)
{
return _curand_binary_search_double(curand(state), distribution);
}
// Generates uniformly distributed double values in range (0.0; 1.0) from uniformly distributed
// unsigned int. We can't use standard _curand_uniform_double since it can generate 1.0.
// This is required only for _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned int x)
{
return x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
}
// Overload for unsigned long long.
// This is required only for _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned long long x)
{
return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/4.0);
}
#define MAGIC_DOUBLE_CONST 500.0
template <typename T>
//George S. Fishman Discrete-event simulation: modeling, programming, and analysis
QUALIFIERS unsigned int _curand_poisson_ITR_double(T x, double lambda)
{
double L,p = 1.0;
double q = 1.0;
unsigned int k = 0;
int pow=0;
// This algorithm requires u to be in (0;1) range, however, _curand_uniform_double
// returns a number in range (0;1]. If u is 1.0 the inner loop never ends. The
// following operation transforms the range from (0;1] to (0;1).
double u = _curand_uniform_double_excluding_one(x);
do{
if (lambda > (double)(pow+MAGIC_DOUBLE_CONST)){
L = exp(-MAGIC_DOUBLE_CONST);
}else{
L = exp((double)(pow - lambda));
}
p *= L;
q *= L;
pow += (int) MAGIC_DOUBLE_CONST;
while (u > q){
k++;
p *= ((double)lambda / (double) k);
q += p;
}
}while((double)pow < lambda);
return k;
}
template <typename T>
/* Rejection Method for Poisson distribution based on gammainc approximation */
QUALIFIERS unsigned int curand_poisson_gammainc(T state, float lambda){
float y, x, t, z,v;
float logl = __cr_log (lambda);
while (true) {
y = curand_uniform (state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
return (unsigned int)x;
}
template <typename T>
/* Rejection Method for Poisson distribution based on gammainc approximation */
QUALIFIERS uint4 curand_poisson_gammainc4(T state, float lambda){
uint4 result;
float y, x, t, z,v;
float logl = __cr_log (lambda);
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.x = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.y = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.z = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.w = (unsigned int)x;
return result;
}
// Note below that the round to nearest integer, where needed, is done in line with code that
// assumes the range of values is < 2**32
template <typename T>
QUALIFIERS unsigned int _curand_poisson(T x, double lambda)
{
if (lambda < 1000)
return _curand_poisson_ITR_double(x, lambda);
return (unsigned int)((sqrt(lambda) * _curand_normal_icdf_double(x)) + lambda + 0.5); //Round to nearest
}
template <typename T>
QUALIFIERS unsigned int _curand_poisson_from_normal(T x, double lambda)
{
return (unsigned int)((sqrt(lambda) * _curand_normal_icdf(x)) + lambda + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS unsigned int curand_poisson_from_normal(STATE state, double lambda)
{
return (unsigned int)((sqrt(lambda) * curand_normal(state)) + lambda + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS uint4 curand_poisson_from_normal4(STATE state, double lambda)
{
uint4 result;
float4 _res;
_res = curand_normal4(state);
result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
return result; //Round to nearest
}
/**
 * \brief Return a Poisson-distributed unsigned int from an XORWOW generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the XORWOW generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateXORWOW_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
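/* Editor's illustrative sketch, not part of the original header: per-thread
 * Poisson draws; curand_poisson() itself chooses between the Knuth loop, the
 * gammainc rejection method and the normal approximation based on lambda.
 * The kernel name and indexing are assumptions; a standalone .cu file would
 * #include <curand_kernel.h>. */
__global__ void example_fill_poisson(unsigned int *out, int n, double lambda,
                                     unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_poisson(&state, lambda);
}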
/**
* \brief Return a Poisson-distributed unsigned int from a Philox4_32_10 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStatePhilox4_32_10_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
/**
* \brief Return four Poisson-distributed unsigned ints from a Philox4_32_10 generator.
*
 * Return four unsigned ints from a Poisson
* distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
 * \return Poisson-distributed uint4 with lambda \p lambda
*/
QUALIFIERS uint4 curand_poisson4(curandStatePhilox4_32_10_t *state, double lambda)
{
uint4 result;
double4 _res;
if (lambda < 64)
return curand_poisson_knuth4(state, (float)lambda);
if (lambda > 4000) {
_res = curand_normal4_double(state);
result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
return result;
}
return curand_poisson_gammainc4(state, (float)lambda);
}
/**
 * \brief Return a Poisson-distributed unsigned int from an MRG32k3a generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the MRG32k3a generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateMRG32k3a_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
/**
 * \brief Return a Poisson-distributed unsigned int from an MTGP32 generator.
*
 * Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the MTGP32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateMtgp32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a Sobol32 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Sobol32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateSobol32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a scrambled Sobol32 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the scrambled Sobol32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a Sobol64 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateSobol64_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a scrambled Sobol64 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol64_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
#endif // !defined(CURAND_POISSON_H_)

File diff suppressed because it is too large

View file

@@ -1,498 +0,0 @@
/* Copyright 2010-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_UNIFORM_H_)
#define CURAND_UNIFORM_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
QUALIFIERS float _curand_uniform(unsigned int x)
{
return x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
QUALIFIERS float4 _curand_uniform4(uint4 x)
{
float4 y;
y.x = x.x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.y = x.y * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.z = x.z * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.w = x.w * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
return y;
}
QUALIFIERS float _curand_uniform(unsigned long long x)
{
unsigned int t;
t = (unsigned int)(x >> 32);
return t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
QUALIFIERS double _curand_uniform_double(unsigned int x)
{
return x * CURAND_2POW32_INV_DOUBLE + CURAND_2POW32_INV_DOUBLE;
}
QUALIFIERS double _curand_uniform_double(unsigned long long x)
{
return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
}
QUALIFIERS double _curand_uniform_double_hq(unsigned int x, unsigned int y)
{
unsigned long long z = (unsigned long long)x ^
((unsigned long long)y << (53 - 32));
return z * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
}
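/* Editor's illustrative restatement, not part of the original header: how the
 * high-quality path packs two 32-bit draws into the 53 significant bits of a
 * double. The function name and the literal constants (1/2^53 and 1/2^54) are
 * assumptions standing in for CURAND_2POW53_INV_DOUBLE and its half. */
QUALIFIERS double example_uniform_double_from_u32_pair(unsigned int x, unsigned int y)
{
    /* y is shifted so the pair spans 53 bits; the overlapping bits fold in via XOR */
    unsigned long long z = (unsigned long long)x ^ ((unsigned long long)y << 21);
    /* 2^-53 scaling plus a half-step gives the documented range excluding 0.0 */
    return z * 1.1102230246251565e-16 + 5.5511151231257827e-17;
}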
QUALIFIERS float curand_uniform(curandStateTest_t *state)
{
return _curand_uniform(curand(state));
}
QUALIFIERS double curand_uniform_double(curandStateTest_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from an XORWOW generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the XORWOW generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation may use any number of calls to \p curand() to
* get enough random bits to create the return value. The current
* implementation uses one call.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateXORWOW_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from an XORWOW generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the XORWOW generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation may use any number of calls to \p curand() to
* get enough random bits to create the return value. The current
* implementation uses exactly two calls.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateXORWOW_t *state)
{
unsigned int x, y;
x = curand(state);
y = curand(state);
return _curand_uniform_double_hq(x, y);
}
/**
* \brief Return a uniformly distributed float from an MRG32k3a generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the MRG32k3a generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation returns up to 23 bits of mantissa, with the minimum
* return value \f$ 2^{-32} \f$
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateMRG32k3a_t *state)
{
return ((float)(curand_MRG32k3a(state)*MRG32K3A_NORM));
}
/**
* \brief Return a uniformly distributed double from an MRG32k3a generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the MRG32k3a generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* Note the implementation returns at most 32 random bits of mantissa as
* outlined in the seminal paper by L'Ecuyer.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateMRG32k3a_t *state)
{
return curand_MRG32k3a(state)*MRG32K3A_NORM;
}
/**
 * \brief Return a uniformly distributed tuple of 2 doubles from a Philox4_32_10 generator.
 *
 * Return two uniformly distributed doubles (as a double2) between \p 0.0 and \p 1.0
* from the Philox4_32_10 generator in \p state, increment position of generator by 4.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return two uniformly distributed doubles between \p 0.0 and \p 1.0
*/
QUALIFIERS double2 curand_uniform2_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
double2 result;
_x = curand4(state);
result.x = _curand_uniform_double_hq(_x.x,_x.y);
result.y = _curand_uniform_double_hq(_x.z,_x.w);
return result;
}
// not a part of API
QUALIFIERS double4 curand_uniform4_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x, _y;
double4 result;
_x = curand4(state);
_y = curand4(state);
result.x = _curand_uniform_double_hq(_x.x,_x.y);
result.y = _curand_uniform_double_hq(_x.z,_x.w);
result.z = _curand_uniform_double_hq(_y.x,_y.y);
result.w = _curand_uniform_double_hq(_y.z,_y.w);
return result;
}
/**
* \brief Return a uniformly distributed float from a Philox4_32_10 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Philox4_32_10 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed float between \p 0.0f and \p 1.0f
*
*/
QUALIFIERS float curand_uniform(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed tuple of 4 floats from a Philox4_32_10 generator.
*
 * Return four uniformly distributed floats (as a float4) between \p 0.0f and \p 1.0f
* from the Philox4_32_10 generator in \p state, increment position of generator by 4.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed float4 where each element is between \p 0.0f and \p 1.0f
*
*/
QUALIFIERS float4 curand_uniform4(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform4(curand4(state));
}
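/* Editor's illustrative sketch, not part of the original header: four uniform
 * floats per call from a Philox4_32_10 state. The kernel name and indexing are
 * assumptions; a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_uniform4(float *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = 4 * tid; i + 3 < n; i += 4 * gridDim.x * blockDim.x) {
        float4 u = curand_uniform4(&state);  /* each component lies in (0.0f, 1.0f] */
        out[i]     = u.x;
        out[i + 1] = u.y;
        out[i + 2] = u.z;
        out[i + 3] = u.w;
    }
}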
/**
 * \brief Return a uniformly distributed float from an MTGP32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the MTGP32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateMtgp32_t *state)
{
return _curand_uniform(curand(state));
}
/**
 * \brief Return a uniformly distributed double from an MTGP32 generator.
 *
 * Return a uniformly distributed double between \p 0.0 and \p 1.0
 * from the MTGP32 generator in \p state, increment position of generator.
 * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
 * point outputs are never returned.
 *
 * Note that the implementation uses only 32 random bits to generate a single double
 * precision value.
 *
 * \param state - Pointer to state to update
 *
 * \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateMtgp32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Philox4_32_10 generator.
*
 * Return a uniformly distributed double between \p 0.0 and \p 1.0
 * from the Philox4_32_10 generator in \p state, increment position of generator.
 * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \p curand_uniform2_double() is recommended for higher quality uniformly distributed
* double precision values.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a Sobol32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateSobol32_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Sobol32 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateSobol32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a scrambled Sobol32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the scrambled Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateScrambledSobol32_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a scrambled Sobol32 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the scrambled Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateScrambledSobol32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a Sobol64 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateSobol64_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Sobol64 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateSobol64_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a scrambled Sobol64 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the scrambled Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateScrambledSobol64_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a scrambled Sobol64 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the scrambled Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateScrambledSobol64_t *state)
{
return _curand_uniform_double(curand(state));
}
#endif // !defined(CURAND_UNIFORM_H_)