cuBLAS just doesn't work properly on Windows. Will leave it as a manual flag for others

Concedo 2023-04-22 10:57:38 +08:00
parent ef13443047
commit 4fa3dfe8bc
183 changed files with 4 additions and 281227 deletions


@@ -55,7 +55,6 @@ BONUSCFLAGS2 =
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I./include/cuda -I./include/cuda/crt
#lets try enabling everything
CFLAGS += -pthread -s
@@ -157,7 +156,6 @@ NOAVX2_BUILD =
OPENBLAS_BUILD =
OPENBLAS_NOAVX2_BUILD =
CLBLAST_BUILD =
CUBLAS_BUILD =
ifeq ($(OS),Windows_NT)
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
@@ -165,7 +163,6 @@ ifeq ($(OS),Windows_NT)
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/cuda.lib lib/cublas.lib lib/cublasLt.lib lib/cudart.lib lib/cudart_static.lib lib/ggml-cuda-kernel.lib -shared -o $@.dll $(LDFLAGS)
else
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
@@ -176,17 +173,12 @@ else
ifdef LLAMA_CLBLAST
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif
ifdef LLAMA_CUBLAS
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 -shared -o $@.so $(LDFLAGS)
endif
ifndef LLAMA_OPENBLAS
ifndef LLAMA_CLBLAST
ifndef LLAMA_CUBLAS
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
endif
endif
endif
endif
#
@@ -223,9 +215,6 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
ggml_clblast.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
ggml_cublas.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CUBLAS_FLAGS) -c $< -o $@
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
@@ -248,7 +237,7 @@ gpttype_adapter.o: gpttype_adapter.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so gptj.exe gpt2.exe
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -270,9 +259,6 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_rwkv.o ggml_v1_noavx2.o e
koboldcpp_clblast: ggml_clblast.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(CLBLAST_BUILD)
koboldcpp_cublas: ggml_cublas.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(CUBLAS_BUILD)
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
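
Note for anyone who still wants the CUDA path after this change: the removed rules above show what the cuBLAS build looked like. Below is a minimal sketch of restoring it by hand on Linux, assuming the system CUDA toolkit lives under /usr/local/cuda (this commit also deletes the vendored ./include/cuda headers and the Windows lib/*.lib import libraries, so those paths are dropped here). Build with: make LLAMA_CUBLAS=1 koboldcpp_cublas

# Sketch only: re-adds the removed cuBLAS rules for a manual Linux build.
CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include

ifdef LLAMA_CUBLAS
# Link against the static cuBLAS/CUDA runtime libraries from the system toolkit.
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 -shared -o $@.so $(LDFLAGS)
endif

# Compile ggml with GGML_USE_CUBLAS enabled.
ggml_cublas.o: ggml.c ggml.h
	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CUBLAS_FLAGS) -c $< -o $@

# Shared-library target mirroring the other koboldcpp_* targets.
koboldcpp_cublas: ggml_cublas.o ggml_rwkv.o ggml_v1.o expose.o common.o gpttype_adapter.o
	$(CUBLAS_BUILD)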

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,129 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
#endif
#include <d3d10.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */


@@ -1,122 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D10_EXT_H
#define __OPENCL_CL_D3D10_EXT_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d10_sharing */
typedef cl_uint cl_d3d10_device_source_nv;
typedef cl_uint cl_d3d10_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D10_DEVICE_NV -1002
#define CL_INVALID_D3D10_RESOURCE_NV -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV -1005
// cl_d3d10_device_source_nv
#define CL_D3D10_DEVICE_NV 0x4010
#define CL_D3D10_DXGI_ADAPTER_NV 0x4011
// cl_d3d10_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D10_NV 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_NV 0x4013
// cl_context_info
#define CL_CONTEXT_D3D10_DEVICE_NV 0x4014
// cl_mem_info
#define CL_MEM_D3D10_RESOURCE_NV 0x4015
// cl_image_info
#define CL_IMAGE_D3D10_SUBRESOURCE_NV 0x4016
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)(
cl_platform_id platform,
cl_d3d10_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D10_H


@@ -1,128 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
#endif
#include <d3d11.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */


@@ -1,122 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D11_EXT_H
#define __OPENCL_CL_D3D11_EXT_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d11_sharing */
typedef cl_uint cl_d3d11_device_source_nv;
typedef cl_uint cl_d3d11_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D11_DEVICE_NV -1006
#define CL_INVALID_D3D11_RESOURCE_NV -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009
// cl_d3d11_device_source_nv
#define CL_D3D11_DEVICE_NV 0x4019
#define CL_D3D11_DXGI_ADAPTER_NV 0x401A
// cl_d3d11_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C
// cl_context_info
#define CL_CONTEXT_D3D11_DEVICE_NV 0x401D
// cl_mem_info
#define CL_MEM_D3D11_RESOURCE_NV 0x401E
// cl_image_info
#define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)(
cl_platform_id platform,
cl_d3d11_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D11_H


@@ -1,143 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D9_EXT_H
#define __OPENCL_CL_D3D9_EXT_H
#include <d3d9.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d9_sharing */
typedef cl_uint cl_d3d9_device_source_nv;
typedef cl_uint cl_d3d9_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D9_DEVICE_NV -1010
#define CL_INVALID_D3D9_RESOURCE_NV -1011
#define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV -1012
#define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV -1013
// cl_d3d9_device_source_nv
#define CL_D3D9_DEVICE_NV 0x4022
#define CL_D3D9_ADAPTER_NAME_NV 0x4023
// cl_d3d9_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D9_NV 0x4024
#define CL_ALL_DEVICES_FOR_D3D9_NV 0x4025
// cl_context_info
#define CL_CONTEXT_D3D9_DEVICE_NV 0x4026
// cl_mem_info
#define CL_MEM_D3D9_RESOURCE_NV 0x4027
// cl_image_info
#define CL_IMAGE_D3D9_FACE_NV 0x4028
#define CL_IMAGE_D3D9_LEVEL_NV 0x4029
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV 0x402A
#define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV 0x402B
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)(
cl_platform_id platform,
cl_d3d9_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d9_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DVertexBuffer9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DIndexBuffer9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9 * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DTexture9 *resource,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DCubeTexture9 * resource,
D3DCUBEMAP_FACES facetype,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DVolumeTexture9 * resource,
UINT miplevel,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D9_H


@@ -1,118 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */


@@ -1,123 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#ifdef __APPLE__
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */

File diff suppressed because it is too large


@@ -1,154 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#define CL_GL_NUM_SAMPLES 0x2012
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */


@@ -1,44 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
#else
#include <CL/cl_gl.h>
#endif
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
cl_GLsync sync,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large


@@ -1,40 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#include <OpenCL/cl_ext.h>
#else
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#endif
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */


@@ -1,64 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"


@@ -1,595 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
* ::cudaChannelFormatKindSignedNormalized8X4,
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
* ::cudaChannelFormatKindUnsignedNormalized8X4,
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
* ::cudaChannelFormatKindSignedNormalized16X4,
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
* ::cudaChannelFormatKindUnsignedNormalized16X4
* or ::cudaChannelFormatKindNV12.
*
* The format is specified by the template specialization.
*
* The template function specializes for the following scalar types:
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
* The template function specializes for the following vector types:
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
* The template function specializes for following cudaChannelFormatKind enum values:
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
*
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
{
int e = (int)sizeof(char) * 8;
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
}
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
/* Signed 8-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
}
/* Unsigned 8-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
}
/* Signed 16-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
{
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
{
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
}
/* Unsigned 16-bit normalized integer formats */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
{
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
{
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
}
/* NV12 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
}
/* BC1 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
}
/* BC1sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
}
/* BC2 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
}
/* BC2sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
}
/* BC3 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
}
/* BC3sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
}
/* BC4 unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
}
/* BC4 signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
{
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
}
/* BC5 unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
}
/* BC5 signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
{
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
}
/* BC6H unsigned format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
}
/* BC6H signed format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
{
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
}
/* BC7 format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
}
/* BC7sRGB format */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
{
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
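For reference, these cudaCreateChannelDesc specializations are normally consumed when allocating a CUDA array or creating a texture object. A minimal host-side sketch using only standard runtime calls (variable names are illustrative, not part of this repository):

#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    /* 32-bit single-channel float: (32, 0, 0, 0, cudaChannelFormatKindFloat). */
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();

    /* Allocate a 256x256 CUDA array with that element format. */
    cudaArray_t arr = NULL;
    cudaError_t err = cudaMallocArray(&arr, &desc, 256, 256);
    if (err != cudaSuccess) {
        printf("cudaMallocArray failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("bits per channel: x=%d y=%d z=%d w=%d kind=%d\n",
           desc.x, desc.y, desc.z, desc.w, (int)desc.f);
    cudaFreeArray(arr);
    return 0;
}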

View file

@@ -1,65 +0,0 @@
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
#endif
#include "crt/common_functions.h"
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
#endif

File diff suppressed because it is too large

View file

@@ -1,310 +0,0 @@
/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
#endif
#if !defined(__COMMON_FUNCTIONS_H__)
#define __COMMON_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
#include "builtin_types.h"
#include "host_defines.h"
#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported. Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
#ifndef __CUDA_API_VER_MAJOR__
#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
#endif /* __CUDA_API_VER_MAJOR__ */
#ifndef __CUDA_API_VER_MINOR__
#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
#endif /* __CUDA_API_VER_MINOR__ */
#if !defined(__CUDACC_RTC__)
#include <string.h>
#include <time.h>
extern "C"
{
#endif /* !__CUDACC_RTC__ */
extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
#if defined(__QNX__)
asm("clock32")
#endif
__THROW;
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memset(void*, int, size_t) __THROW;
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memcpy(void*, const void*, size_t) __THROW;
#if !defined(__CUDACC_RTC__)
}
#endif /* !__CUDACC_RTC__ */
#if defined(__CUDA_ARCH__)
#if defined(__CUDACC_RTC__)
inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
inline __host__ __device__ void operator delete(void*, void*) { }
inline __host__ __device__ void operator delete[](void*, void*) { }
#else /* !__CUDACC_RTC__ */
#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
#include <new>
#endif
#if defined (__GNUC__)
#define STD \
std::
#else /* __GNUC__ */
#define STD
#endif /* __GNUC__ */
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, void*) throw();
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, void*) throw();
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
#endif /* __CUDACC_RTC__ */
#if !defined(__CUDACC_RTC__)
#include <stdio.h>
#include <stdlib.h>
#endif /* !__CUDACC_RTC__ */
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
namespace std {
#endif
extern "C"
{
extern
#if !defined(_MSC_VER) || _MSC_VER < 1900
_CRTIMP
#endif
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) )
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...) __THROW;
#else /* newer glibc */
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...);
#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
extern _CRTIMP __host__ __device__ __cudart_builtin__ void* __cdecl malloc(size_t) __THROW;
extern _CRTIMP __host__ __device__ __cudart_builtin__ void __cdecl free(void*) __THROW;
#if defined(_MSC_VER)
extern __host__ __device__ __cudart_builtin__ void* __cdecl _alloca(size_t);
#endif
#if defined(__QNX__)
#undef alloca
#define alloca(__S) __builtin_alloca(__S)
#endif
}
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
} /* std */
#endif
#if !defined(__CUDACC_RTC__)
#include <assert.h>
#endif /* !__CUDACC_RTC__ */
extern "C"
{
#if defined(__CUDACC_RTC__)
extern __host__ __device__ void __assertfail(const char * __assertion,
const char *__file,
unsigned int __line,
const char *__function,
size_t charsize);
#elif defined(__APPLE__)
#define __builtin_expect(exp,c) (exp)
extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
const char *, const char *, int, const char *);
#elif defined(__ANDROID__)
extern __host__ __device__ __cudart_builtin__ void __assert2(
const char *, int, const char *, const char *);
#elif defined(__QNX__)
#if !defined(_LIBCPP_VERSION)
namespace std {
#endif
extern __host__ __device__ __cudart_builtin__ void __assert(
const char *, const char *, unsigned int, const char *);
#if !defined(_LIBCPP_VERSION)
}
#endif
#elif defined(__HORIZON__)
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
const char *, const char *, int, const char *);
#elif defined(__GNUC__)
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
const char *, const char *, unsigned int, const char *)
__THROW;
#elif defined(_WIN32)
extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
const wchar_t *, const wchar_t *, unsigned);
#endif
}
#if defined(__CUDACC_RTC__)
#ifdef NDEBUG
#define assert(e) (static_cast<void>(0))
#else /* !NDEBUG */
#define __ASSERT_STR_HELPER(x) #x
#define assert(e) ((e) ? static_cast<void>(0)\
: __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
__LINE__, __PRETTY_FUNCTION__,\
sizeof(char)))
#endif /* NDEBUG */
__host__ __device__ void* operator new(size_t);
__host__ __device__ void* operator new[](size_t);
__host__ __device__ void operator delete(void*);
__host__ __device__ void operator delete[](void*);
# if __cplusplus >= 201402L
__host__ __device__ void operator delete(void*, size_t);
__host__ __device__ void operator delete[](void*, size_t);
#endif /* __cplusplus >= 201402L */
#if __cplusplus >= 201703L
namespace std { enum class align_val_t : size_t {}; }
__host__ __device__ void* __cdecl operator new(size_t sz, std::align_val_t) noexcept;
__host__ __device__ void* __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
__host__ __device__ void __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
#endif /* __cplusplus >= 201703L */
#else /* !__CUDACC_RTC__ */
#if defined (__GNUC__)
#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if (__cplusplus >= 201103L) && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
#define THROWBADALLOC
#else
#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
#define THROWBADALLOC
#else
#define THROWBADALLOC throw(STD bad_alloc)
#endif
#endif
#define __DELETE_THROW throw()
#undef __NV_GLIBCXX_VERSION
#else /* __GNUC__ */
#define THROWBADALLOC throw(...)
#endif /* __GNUC__ */
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t) THROWBADALLOC;
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t) THROWBADALLOC;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*) throw();
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
#if __cpp_aligned_new
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, std::align_val_t);
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, std::align_val_t);
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
#endif /* __cpp_aligned_new */
#undef THROWBADALLOC
#undef STD
#endif /* __CUDACC_RTC__ */
#endif /* __CUDA_ARCH__ */
#endif /* __cplusplus && __CUDACC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
#include "cuda_device_runtime_api.h"
#endif
#include "math_functions.h"
#endif /* !__COMMON_FUNCTIONS_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
#endif
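These declarations are what make printf, malloc/free, and assert callable from device code. A minimal sketch of that device-side usage (hypothetical kernel, assumes a device of compute capability 2.0 or newer for the device heap):

#include <cuda_runtime.h>
#include <stdio.h>
#include <assert.h>

__global__ void scratch_kernel(void)
{
    /* Device-side printf, as declared above. */
    printf("thread %d reporting\n", (int)threadIdx.x);

    /* Device-side heap allocation plus device-side assert. */
    int *p = (int *)malloc(4 * sizeof(int));
    assert(p != NULL);
    p[threadIdx.x % 4] = (int)threadIdx.x;
    free(p);
}

int main(void)
{
    scratch_kernel<<<1, 4>>>();
    cudaDeviceSynchronize(); /* flushes buffered device printf output */
    return 0;
}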

File diff suppressed because it is too large

View file

@@ -1,197 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
#endif
#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC_RTC__)
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
#else
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
#endif /* __CUDACC_RTC__ */
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __fma_rz(a, b, c) :
mode == cudaRoundPosInf ? __fma_ru(a, b, c) :
mode == cudaRoundMinInf ? __fma_rd(a, b, c) :
__fma_rn(a, b, c);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dmul_rz(a, b) :
mode == cudaRoundPosInf ? __dmul_ru(a, b) :
mode == cudaRoundMinInf ? __dmul_rd(a, b) :
__dmul_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dadd_rz(a, b) :
mode == cudaRoundPosInf ? __dadd_ru(a, b) :
mode == cudaRoundMinInf ? __dadd_rd(a, b) :
__dadd_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __dsub_rz(a, b) :
mode == cudaRoundPosInf ? __dsub_ru(a, b) :
mode == cudaRoundMinInf ? __dsub_rd(a, b) :
__dsub_rn(a, b);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2int_rn(a) :
mode == cudaRoundPosInf ? __double2int_ru(a) :
mode == cudaRoundMinInf ? __double2int_rd(a) :
__double2int_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2uint_rn(a) :
mode == cudaRoundPosInf ? __double2uint_ru(a) :
mode == cudaRoundMinInf ? __double2uint_rd(a) :
__double2uint_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2ll_rn(a) :
mode == cudaRoundPosInf ? __double2ll_ru(a) :
mode == cudaRoundMinInf ? __double2ll_rd(a) :
__double2ll_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
{
return mode == cudaRoundNearest ? __double2ull_rn(a) :
mode == cudaRoundPosInf ? __double2ull_ru(a) :
mode == cudaRoundMinInf ? __double2ull_rd(a) :
__double2ull_rz(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __ll2double_rz(a) :
mode == cudaRoundPosInf ? __ll2double_ru(a) :
mode == cudaRoundMinInf ? __ll2double_rd(a) :
__ll2double_rn(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
{
return mode == cudaRoundZero ? __ull2double_rz(a) :
mode == cudaRoundPosInf ? __ull2double_ru(a) :
mode == cudaRoundMinInf ? __ull2double_rd(a) :
__ull2double_rn(a);
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
{
return (double)a;
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
{
return (double)a;
}
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
{
return (double)a;
}
#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
#endif
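The wrappers above only select among directed-rounding intrinsics based on cudaRoundMode. A minimal sketch showing the same choice made directly with those intrinsics (kernel and variable names are illustrative):

#include <cuda_runtime.h>

__global__ void rounding_demo(const double *a, const double *b, double *lo, double *hi)
{
    int i = (int)threadIdx.x;
    lo[i] = __dadd_rd(a[i], b[i]); /* dadd(a, b, cudaRoundMinInf): round toward -infinity */
    hi[i] = __dadd_ru(a[i], b[i]); /* dadd(a, b, cudaRoundPosInf): round toward +infinity */
}

int main(void)
{
    double *a, *b, *lo, *hi;
    cudaMallocManaged(&a, 4 * sizeof(double));
    cudaMallocManaged(&b, 4 * sizeof(double));
    cudaMallocManaged(&lo, 4 * sizeof(double));
    cudaMallocManaged(&hi, 4 * sizeof(double));
    for (int i = 0; i < 4; ++i) { a[i] = 1.0 / 3.0; b[i] = (double)i; }
    rounding_demo<<<1, 4>>>(a, b, lo, hi);
    cudaDeviceSynchronize();
    return 0;
}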

File diff suppressed because it is too large

View file

@@ -1,212 +0,0 @@
/*
* Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif
#if !defined(__DEVICE_FUNCTIONS_HPP__)
#define __DEVICE_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
#else
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
#endif /* __CUDACC_RTC__ */
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__DEVICE_FUNCTIONS_STATIC_DECL__ int mulhi(const int a, const int b)
{
return __mulhi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const unsigned int a, const unsigned int b)
{
return __umulhi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const int a, const unsigned int b)
{
return __umulhi(static_cast<unsigned int>(a), b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int mulhi(const unsigned int a, const int b)
{
return __umulhi(a, static_cast<unsigned int>(b));
}
__DEVICE_FUNCTIONS_STATIC_DECL__ long long int mul64hi(const long long int a, const long long int b)
{
return __mul64hi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const unsigned long long int a, const unsigned long long int b)
{
return __umul64hi(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const long long int a, const unsigned long long int b)
{
return __umul64hi(static_cast<unsigned long long int>(a), b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int mul64hi(const unsigned long long int a, const long long int b)
{
return __umul64hi(a, static_cast<unsigned long long int>(b));
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int float_as_int(const float a)
{
return __float_as_int(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float int_as_float(const int a)
{
return __int_as_float(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int float_as_uint(const float a)
{
return __float_as_uint(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float uint_as_float(const unsigned int a)
{
return __uint_as_float(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float saturate(const float a)
{
return __saturatef(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int mul24(const int a, const int b)
{
return __mul24(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int umul24(const unsigned int a, const unsigned int b)
{
return __umul24(a, b);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ int float2int(const float a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundNearest) ? __float2int_rn(a) :
(mode == cudaRoundPosInf ) ? __float2int_ru(a) :
(mode == cudaRoundMinInf ) ? __float2int_rd(a) :
__float2int_rz(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int float2uint(const float a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundNearest) ? __float2uint_rn(a) :
(mode == cudaRoundPosInf ) ? __float2uint_ru(a) :
(mode == cudaRoundMinInf ) ? __float2uint_rd(a) :
__float2uint_rz(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float int2float(const int a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundZero ) ? __int2float_rz(a) :
(mode == cudaRoundPosInf) ? __int2float_ru(a) :
(mode == cudaRoundMinInf) ? __int2float_rd(a) :
__int2float_rn(a);
}
__DEVICE_FUNCTIONS_STATIC_DECL__ float uint2float(const unsigned int a, const enum cudaRoundMode mode)
{
return (mode == cudaRoundZero ) ? __uint2float_rz(a) :
(mode == cudaRoundPosInf) ? __uint2float_ru(a) :
(mode == cudaRoundMinInf) ? __uint2float_rd(a) :
__uint2float_rn(a);
}
#undef __DEVICE_FUNCTIONS_DECL__
#undef __DEVICE_FUNCTIONS_STATIC_DECL__
#endif /* __cplusplus && __CUDACC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#endif /* !__DEVICE_FUNCTIONS_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif
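A brief device-side sketch of what a few of these overloads resolve to, using the documented intrinsics they wrap (kernel and buffer names are illustrative):

#include <cuda_runtime.h>

__global__ void bits_demo(const float *in, unsigned int *bits, float *clamped, int *hi32)
{
    int i = (int)threadIdx.x;
    bits[i]    = __float_as_uint(in[i]); /* float_as_uint(): reinterpret the bit pattern */
    clamped[i] = __saturatef(in[i]);     /* saturate(): clamp to [0.0f, 1.0f] */
    hi32[i]    = __mulhi(i, 1 << 16);    /* mulhi(): upper 32 bits of the 64-bit product */
}

int main(void)
{
    float *in, *clamped;
    unsigned int *bits;
    int *hi32;
    cudaMallocManaged(&in, 4 * sizeof(float));
    cudaMallocManaged(&bits, 4 * sizeof(unsigned int));
    cudaMallocManaged(&clamped, 4 * sizeof(float));
    cudaMallocManaged(&hi32, 4 * sizeof(int));
    for (int i = 0; i < 4; ++i) in[i] = 0.5f * (float)i - 0.5f;
    bits_demo<<<1, 4>>>(in, bits, clamped, hi32);
    cudaDeviceSynchronize();
    return 0;
}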

View file

@@ -1,57 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
#endif
#if !defined(__FUNC_MACRO_H__)
#define __FUNC_MACRO_H__
#if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__GNUC__)
#define __func__(decl) \
inline decl
#define __device_func__(decl) \
static __attribute__((__unused__)) decl
#elif defined(_WIN32)
#define __func__(decl) \
static inline decl
#define __device_func__(decl) \
static decl
#endif /* __GNUC__ */
#endif /* __FUNC_MACRO_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
#endif

View file

@@ -1,293 +0,0 @@
/*
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
#endif
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
/* check invalid configurations */
#if defined(__PGIC__)
#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
#endif /* defined(__PGIC__) */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#endif /* __powerpc__ */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
/* check host compiler version */
#if !__NV_NO_HOST_COMPILER_CHECK
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__powerpc__)
#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 11
#error -- unsupported GNU version! gcc versions later than 11 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* __GNUC__ > 11 */
#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__)
#if (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
#error -- unsupported clang version! clang version must be less than 14 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#endif /* (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1910 || _MSC_VER >= 1940
#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
#elif _MSC_VER >= 1910 && _MSC_VER < 1910
#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1940) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
#endif /* _WIN32 */
#endif /* !__NV_NO_HOST_COMPILER_CHECK */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
#endif
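The version gates above follow one pattern: hard-error on an unsupported host compiler unless an override macro is defined. A minimal sketch of that pattern for a hypothetical project header (the MYLIB_* names are invented for illustration and are not CUDA or koboldcpp symbols):

/* mylib_host_check.h, illustrative only */
#if !defined(MYLIB_ALLOW_UNSUPPORTED_COMPILER)
#  if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ > 11)
#    error unsupported gcc version, define MYLIB_ALLOW_UNSUPPORTED_COMPILER to override at your own risk
#  endif
#  if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1940)
#    error unsupported MSVC version, define MYLIB_ALLOW_UNSUPPORTED_COMPILER to override at your own risk
#  endif
#endif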

View file

@@ -1,246 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
#endif
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#define CUDARTAPI_CDECL
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#define CUDARTAPI_CDECL \
__cdecl
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#if defined(__CUDACC__) || !defined(__host__)
#define __host__ \
__location__(host)
#endif /* defined(__CUDACC__) || !defined(__host__) */
#if defined(__CUDACC__) || !defined(__device__)
#define __device__ \
__location__(device)
#endif /* defined(__CUDACC__) || !defined(__device__) */
#if defined(__CUDACC__) || !defined(__global__)
#define __global__ \
__location__(global)
#endif /* defined(__CUDACC__) || !defined(__global__) */
#if defined(__CUDACC__) || !defined(__shared__)
#define __shared__ \
__location__(shared)
#endif /* defined(__CUDACC__) || !defined(__shared__) */
#if defined(__CUDACC__) || !defined(__constant__)
#define __constant__ \
__location__(constant)
#endif /* defined(__CUDACC__) || !defined(__constant__) */
#if defined(__CUDACC__) || !defined(__managed__)
#define __managed__ \
__location__(managed)
#endif /* defined(__CUDACC__) || !defined(__managed__) */
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
#endif /* !__HOST_DEFINES_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
#endif
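For orientation, the deleted header above is what gives __host__, __device__, __global__, __align__ and __launch_bounds__ their meaning under nvcc. A minimal sketch of how those qualifiers show up in ordinary CUDA code (the kernel, sizes and launch bound below are illustrative, not taken from this repository):

#include <cuda_runtime.h>

// Compiled for both host and device via the __host__/__device__ annotations defined above.
__host__ __device__ inline float scale(float x) { return 2.0f * x; }

// __global__ marks a kernel entry point; __launch_bounds__(256) expands to
// __annotate__(launch_bounds(256)) and promises at most 256 threads per block.
__global__ void __launch_bounds__(256) scale_all(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = scale(data[i]);
}

int main()
{
    const int n = 1024;
    float *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    scale_all<<<(n + 255) / 256, 256>>>(d, n); // standard triple-chevron launch
    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}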

View file

@@ -1,288 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
#endif
#if !defined(__CUDA_INTERNAL_COMPILATION__)
#define __CUDA_INTERNAL_COMPILATION__
#define __text__
#define __surf__
#define __name__shadow_var(c, cpp) \
#c
#define __name__text_var(c, cpp) \
#cpp
#define __host__shadow_var(c, cpp) \
cpp
#define __text_var(c, cpp) \
cpp
#define __device_fun(fun) \
#fun
#define __device_var(var) \
#var
#define __device__text_var(c, cpp) \
#c
#define __device__shadow_var(c, cpp) \
#c
#if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \
f
#else /* _WIN32 && !_WIN64 */
#define __pad__(f)
#endif /* _WIN32 && !_WIN64 */
#include "builtin_types.h"
#include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */
template <typename T>
static inline T *__cudaAddressOf(T &val)
{
return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
}
#define __cudaRegisterBinary(X) \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
{ void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
__cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
__cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
__cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
__cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
__cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
dim3 *gridDim,
dim3 *blockDim,
size_t *sharedMem,
void *stream
);
#define __cudaLaunchPrologue(size) \
void * __args_arr[size]; \
int __args_idx = 0
#define __cudaSetupArg(arg, offset) \
__args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
#define __cudaSetupArgSimple(arg, offset) \
__args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
#if defined(__GNUC__)
#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
#else /* !__GNUC__ */
#define __NV_ATTR_UNUSED_FOR_LAUNCH
#endif /* __GNUC__ */
/* the use of __args_idx in the expression below avoids host compiler warning about it being an
unused variable when the launch has no arguments */
#define __cudaLaunch(fun) \
{ volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
dim3 __gridDim, __blockDim;\
size_t __sharedMem; \
cudaStream_t __stream; \
if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
return; \
if (__args_idx == 0) {\
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
} else { \
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
}\
}
#if defined(__GNUC__)
#define __nv_dummy_param_ref(param) \
{ volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
#else /* __GNUC__ */
#define __nv_dummy_param_ref(param) \
{ volatile static void **__ref; __ref = (volatile void **)param; }
#endif /* __GNUC__ */
static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
extern "C" {
void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
}
#define __TO_STRING_CORE(X) #X
#define __TO_STRING(X) __TO_STRING_CORE(X)
extern "C" {
#if defined(_WIN32)
#pragma data_seg("__nv_module_id")
static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
#pragma data_seg()
#elif defined(__APPLE__)
static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
#else
static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
#endif
#undef __FATIDNAME_CORE
#undef __FATIDNAME
#define __FATIDNAME_CORE(X) __fatbinwrap##X
#define __FATIDNAME(X) __FATIDNAME_CORE(X)
#define ____cudaRegisterLinkedBinary(X) \
{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
}
extern "C" {
extern void** CUDARTAPI __cudaRegisterFatBinary(
void *fatCubin
);
extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaUnregisterFatBinary(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaRegisterVar(
void **fatCubinHandle,
char *hostVar,
char *deviceAddress,
const char *deviceName,
int ext,
size_t size,
int constant,
int global
);
extern void CUDARTAPI __cudaRegisterManagedVar(
void **fatCubinHandle,
void **hostVarPtrAddress,
char *deviceAddress,
const char *deviceName,
int ext,
size_t size,
int constant,
int global
);
extern char CUDARTAPI __cudaInitModule(
void **fatCubinHandle
);
extern void CUDARTAPI __cudaRegisterTexture(
void **fatCubinHandle,
const struct textureReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int norm,
int ext
);
extern void CUDARTAPI __cudaRegisterSurface(
void **fatCubinHandle,
const struct surfaceReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int ext
);
extern void CUDARTAPI __cudaRegisterFunction(
void **fatCubinHandle,
const char *hostFun,
char *deviceFun,
const char *deviceName,
int thread_limit,
uint3 *tid,
uint3 *bid,
dim3 *bDim,
dim3 *gDim,
int *wSize
);
#if defined(__APPLE__)
extern "C" int atexit(void (*)(void));
#elif defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
extern int atexit(void(*)(void)) throw();
#elif defined(__HORIZON__)
// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
#define atexit(p)
#else /* __GNUC__ && !__ANDROID__ */
extern int __cdecl atexit(void(__cdecl *)(void));
#endif
}
static void **__cudaFatCubinHandle;
static void __cdecl __cudaUnregisterBinaryUtil(void)
{
____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
__cudaUnregisterFatBinary(__cudaFatCubinHandle);
}
static char __nv_init_managed_rt_with_module(void **handle)
{
return __cudaInitModule(handle);
}
#include "common_functions.h"
#pragma pack()
#if defined(_WIN32)
#pragma warning(disable: 4099)
#if !defined(_WIN64)
#pragma warning(disable: 4408)
#endif /* !_WIN64 */
#endif /* _WIN32 */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
#endif
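The __cudaLaunchPrologue/__cudaSetupArg/__cudaLaunch machinery in the deleted header above is roughly what nvcc emits for a <<<...>>> launch: arguments are gathered into a void* array and handed to cudaLaunchKernel together with the popped launch configuration. A hand-written sketch of the same idea, using a hypothetical add_one kernel (illustrative only, not code from this repository):

#include <cuda_runtime.h>

__global__ void add_one(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main()
{
    const int n = 256;
    float *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));

    // Equivalent of add_one<<<1, 256>>>(d, n): pack a pointer to each argument,
    // then pass the array to cudaLaunchKernel, mirroring __cudaSetupArgSimple
    // and __cudaLaunch above.
    int n_arg = n;
    void *args[] = { &d, &n_arg };
    cudaLaunchKernel((void *)add_one, dim3(1), dim3(256), args, /*sharedMem=*/0, /*stream=*/0);

    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}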

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,754 +0,0 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead.")
#else
#warning "crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
#endif
#if !defined(__CUDA_MMA_H__)
#define __CUDA_MMA_H__
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
#define __CUDA_IMMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
#define __CUDA_SUBBYTE_IMMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#define __CUDA_AMPERE_MMA__ 1
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
namespace nvcuda {
namespace wmma {
// utility functions
#ifdef __CUDA_AMPERE_MMA__
inline __device__ float __float_to_tf32(float in)
{
float ret;
asm("{\n .reg .b32 __$1;"
"\n cvt.rna.tf32.f32 __$1, %1;"
"\n mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) );
return ret;
}
#endif /* __CUDA_AMPERE_MMA__ */
//
// tags
//
struct row_major;
struct col_major;
struct matrix_a;
struct matrix_b;
struct accumulator;
#ifdef __CUDA_AMPERE_MMA__
namespace precision {
struct tf32;
}
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
namespace experimental {
namespace precision {
struct u4; // 4-bit unsigned
struct s4; // 4-bit signed
struct b1; // 1-bit
}
enum bmmaBitOp { bmmaBitOpXOR = 1
#ifdef __CUDA_AMPERE_MMA__
, bmmaBitOpAND = 2
#endif /* __CUDA_AMPERE_MMA__ */
};
enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 };
}
#endif /* __CUDA_SUBBYTE_IMMA__ */
//
// layout
//
enum layout_t {
mem_row_major, mem_col_major
};
template <typename T>
struct helper_traits {
typedef T element_type;
typedef T storage_element_type;
typedef T fill_argument_type;
};
#ifdef __CUDA_SUBBYTE_IMMA__
template<> struct helper_traits<experimental::precision::u4> {
typedef experimental::precision::u4 element_type;
typedef unsigned int storage_element_type;
typedef unsigned int fill_argument_type;
};
template<> struct helper_traits<experimental::precision::s4> {
typedef experimental::precision::s4 element_type;
typedef int storage_element_type;
typedef int fill_argument_type;
};
template<> struct helper_traits<experimental::precision::b1> {
typedef experimental::precision::b1 element_type;
typedef unsigned int storage_element_type;
typedef unsigned int fill_argument_type;
};
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> struct helper_traits<precision::tf32> {
typedef precision::tf32 element_type;
typedef float storage_element_type;
typedef float fill_argument_type;
};
#endif /* __CUDA_AMPERE_MMA__ */
//
// The base fragment type
//
/* note: alignment required for compiler implementation */
template <typename T, int size, int packed_size = size>
struct __align__(8) __frag_base {
/* Number of elements in the fragment */
enum {num_elements = size};
/* Number of storage elements in the fragment.
The elements of the fragment are packed together when the
fragment element type is experimental::precision::u4,
experimental::precision::s4 or experimental::precision::b1.
When elements are packed, num_storage_elements
will be smaller than num_elements.
*/
enum {num_storage_elements = packed_size};
/* element type of the fragment */
typedef T element_type;
/* element type of the storage representation.
The mapping from element_type to storage_element_type is as follows:
experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
experimental::precision::s4 -> int (8 elements in 1 storage element)
experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
precision::tf32 -> float (1 element in 1 storage element)
all other types T -> T
*/
typedef typename helper_traits<T>::storage_element_type storage_element_type;
/* Storage for the (possibly packed) fragment elements. */
storage_element_type x[num_storage_elements];
};
template <typename FragEleType, typename StorageType, typename ArgType>
static inline __device__ StorageType __get_storage_value(ArgType in) { return in; }
#ifdef __CUDA_SUBBYTE_IMMA__
template<>
__device__ inline unsigned
__get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
{
/* For experimental::precision::u4 fragment element type, pack 8 elements into a single
32-bit unsigned int storage element */
unsigned val = in & 0xf;
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
(val << 20) | (val << 24) | (val << 28));
};
template<>
__device__ inline int
__get_storage_value<experimental::precision::s4, int, int>(int in)
{
/* For experimental::precision::s4 fragment element type, pack 8 elements into a single
32-bit signed int storage element */
int val = in & 0xf;
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
(val << 20) | (val << 24) | (val << 28));
};
template<>
__device__ inline unsigned
__get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
{
/* For experimental::precision::b1 fragment element type, pack 32 elements into a
single 32-bit unsigned int storage element */
return (in & 0x1) ? 0xFFFFFFFFU : 0;
}
#endif /* __CUDA_SUBBYTE_IMMA__ */
template <typename FragEleType, int size, int packed_size>
__CUDA_MMA_DEVICE_DECL__ void fill_fragment(__frag_base<FragEleType, size, packed_size>& f,
/* The mapping from fragment element type (FragEleType) to fill_argument_type is:
experimental::precision::u4 -> unsigned (only lower 4 bits taken)
experimental::precision::s4 -> int (only lower 4 bits taken)
experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
precision::tf32 -> float
all other types T -> T
*/
const typename helper_traits<FragEleType>::fill_argument_type & in) {
/* get the (possibly packed) storage element value. See the specializations above for fragment
element types where the storage representation is packed */
typedef typename helper_traits<FragEleType>::storage_element_type storage_type;
storage_type v = __get_storage_value<FragEleType, storage_type>(in);
#pragma unroll
for (int i=0; i< f.num_storage_elements; i++)
f.x[i] = v;
}
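/* Worked example of the packing above (added illustration, not in the original
   header): for a u4 fragment, fill_fragment(f, 5u) takes val = 5 & 0xf = 0x5 and
   replicates it into 0x55555555, so each 32-bit storage element carries eight
   packed copies of 5; for b1, any odd fill value becomes 0xFFFFFFFF (32 set
   bits) and any even value becomes 0. */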
//
// Fragment template
//
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
//
// Fragments for 16x16x16
//
template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Fragments for 32x8x16
//
template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Fragments for 8x32x16
//
template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
#ifdef __CUDA_IMMA__
template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Fragments for 8x8x32
//
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
//
// Fragments for 8x8x128
//
template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Fragments for 16x16x8
//
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
//
// Fragments for 8x8x4
//
template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Load functions for frags of shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Load functions for frags of shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Load functions for frags of shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Load functions for frags of shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Load functions for frags of shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// Store functions for frags of shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
//
// Store functions for frags of shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
//
// Store functions for frags of shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// Store functions for frags of shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Store functions for frags of shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// Store functions for frags of shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
//
// Store functions for frags of shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m16n16k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m32n8k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m8n32k16
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
#ifdef __CUDA_IMMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
#ifdef __CUDA_SUBBYTE_IMMA__
//
// MMA functions for shape m8n8k32
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
//
// MMA functions for shape m8n8k128
//
__CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
experimental::bmmaBitOp = experimental::bmmaBitOpXOR,
experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
#endif /* __CUDA_SUBBYTE_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
//
// MMA functions for shape m16n16k8
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
//
// MMA functions for shape m8n8k4
//
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
#endif /* __CUDA_AMPERE_MMA__ */
};
};
#undef __DEF_IF_HOST
#undef __CUDA_IMMA__
#undef __CUDA_SUBBYTE_IMMA__
#undef __CUDA_AMPERE_MMA__
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __CUDA_MMA_DEVICE_DECL__
#if defined(__CUDA_ARCH__)
#include "mma.hpp"
#endif /* defined(__CUDA_ARCH__) */
#endif /* !__CUDA_MMA_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
#endif
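For context, a minimal usage sketch of the wmma mma_sync overload family declared above, driven through the public nvcuda::wmma interface rather than these internal declarations. The kernel and buffer names are illustrative assumptions, and one full warp is assumed to be launched per 8x32 output tile.

#include <mma.h>
#include <cuda_fp16.h>
using namespace nvcuda;

// One warp computes one 8x32 tile: acc = A(8x16) * B(16x32) + acc.
__global__ void wmma_tile_m8n32k16(const __half *a, const __half *b, float *c) {
    wmma::fragment<wmma::matrix_a, 8, 32, 16, __half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 8, 32, 16, __half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 8, 32, 16, float> acc;

    wmma::fill_fragment(acc, 0.0f);                            // zero the accumulator tile
    wmma::load_matrix_sync(a_frag, a, 16);                     // row-major A, leading dimension 16
    wmma::load_matrix_sync(b_frag, b, 16);                     // col-major B, leading dimension 16
    wmma::mma_sync(acc, a_frag, b_frag, acc);                  // one of the overloads declared above
    wmma::store_matrix_sync(c, acc, 32, wmma::mem_row_major);  // 8x32 result, leading dimension 32
}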

File diff suppressed because it is too large.

View file

@@ -1,621 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead.")
#else
#warning "crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
#endif
#ifndef __NV_LIBCXX_FUNCTIONAL_H__
#define __NV_LIBCXX_FUNCTIONAL_H__
#if __cplusplus < 201103L
#if defined(_MSC_VER)
#if _MSC_VER < 1800
#error This library requires VS 2013 and above
#endif /* _MSC_VER < 1800 */
#else /* !_MSC_VER */
#error This library requires support for the ISO C++ 2011 standard
#endif /* _MSC_VER */
#endif /* __cplusplus */
#if defined(_MSC_VER)
#define __NV_ALIGNOF __alignof
#define __NV_NOEXCEPT
#define __NV_CONSTEXPR
#else /* !_MSC_VER */
#define __NV_ALIGNOF alignof
#define __NV_NOEXCEPT noexcept
#define __NV_CONSTEXPR constexpr
#endif /* _MSC_VER */
#include <type_traits>
#include <cstddef>
#include <new>
// n3290 20.8
namespace nvstd
{
namespace internal {
// D.8.1 base (deprecated) [depr.base]
template <class _Arg, class _Result>
struct unary_function
{
typedef _Arg argument_type;
typedef _Result result_type;
};
template <class _Arg1, class _Arg2, class _Result>
struct binary_function
{
typedef _Arg1 first_argument_type;
typedef _Arg2 second_argument_type;
typedef _Result result_type;
};
// move
template <class _T>
inline __device__ __host__
typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
{
return static_cast<typename std::remove_reference<_T>::type&&>(__t);
}
// 20.2.2 swap [utility.swap]
// swap
template<class _T,
class = typename std::enable_if<
std::is_move_constructible<_T>::value &&
std::is_move_assignable<_T>::value>::type>
inline __device__ __host__
void swap(_T& __a, _T& __b)
#if !defined(_MSC_VER)
noexcept(std::is_nothrow_move_constructible<_T>::value &&
std::is_nothrow_move_assignable<_T>::value)
#endif /* !defined(_MSC_VER) */
{
_T __t(internal::move(__a));
__a = internal::move(__b);
__b = internal::move(__t);
}
// 20.2.3 forward/move helpers [forward]
// forward
template <class _T>
inline __device__ __host__
_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
{
return static_cast<_T&&>(__t);
}
template <class _T>
inline __device__ __host__
_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
{
static_assert(!std::is_lvalue_reference<_T>::value,
"Error: __t is instantiated with an lvalue reference type");
return static_cast<_T&&>(__t);
}
} // namespace internal
namespace __functional_helpers
{
struct __dummy_class;
// Store small functors locally:
// a functor is eligible for local (small-buffer) storage if it is one of the following types:
// * member object pointer;
// * member function pointer;
// * closure type of size less than or equal to the largest size of
// the above types;
// * function pointer;
// * any callable class whose size is less than or equal to
// the largest one of the above types;
union _Small_functor_types
{
void *__obj;
void (*__func_ptr)();
void (__dummy_class::*mem_fn_ptr)();
};
struct _Small_functor_data {
char __data[sizeof(_Small_functor_types)];
};
template <class _RetType, class ..._ArgTypes>
struct __maybe_base_function
{ };
template <class _RetType, class _T1>
struct __maybe_base_function<_RetType(_T1)>
: public internal::unary_function<_T1, _RetType>
{ };
template <class _RetType, class _T1, class _T2>
struct __maybe_base_function<_RetType(_T1, _T2)>
: public internal::binary_function<_T1, _T2, _RetType>
{ };
} // namespace __functional_helpers
// 20.8.11 Polymorphic function wrappers [func.wrap]
// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
// unimplemented because of exception
// class bad_function_call : public std::exception
// 20.8.11.2 Class template function [func.wrap.func]
template<class> class function; // undefined
// Simplified version of template class function, which
// * does not support allocator_arg_t;
// * does not support target and target_type that rely on RTTI
// * does not throw bad_function_call exception on invoking a NULL target
template <class _RetType, class ..._ArgTypes>
class function<_RetType(_ArgTypes...)>
: public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
{
__functional_helpers::_Small_functor_data __small_functor_data;
void *__obj;
typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
__meta_fn_type __meta_fn;
typedef void(*__cloner_type)(function &, const function &);
__cloner_type __cloner;
typedef void(*__destructor_type)(function *);
__destructor_type __destructor;
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
__NV_CONSTEXPR bool __use_small_functor_data() const
{
return (sizeof(_F) <= sizeof(__small_functor_data) &&
__NV_ALIGNOF(_F) <= __NV_ALIGNOF(
__functional_helpers::_Small_functor_types));
}
#pragma nv_exec_check_disable
__device__ __host__
void* __get_small_functor_data() const
{
return (void*)(&__small_functor_data.__data[0]);
}
#pragma nv_exec_check_disable
__device__ __host__
bool __is_small_functor_data() const
{
return __obj == __get_small_functor_data();
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static _F& __get_functor(void *__p)
{
return *((_F*)__p);
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static bool __is_empty_functor(const _F& /*__p*/)
{
return false;
}
#pragma nv_exec_check_disable
template <class _F>
__device__ __host__
static bool __is_empty_functor(const _F* __p)
{
return !__p;
}
#pragma nv_exec_check_disable
template <class _Res, class _C>
__device__ __host__
static bool __is_empty_functor(const _Res _C::* __p)
{
return !__p;
}
#pragma nv_exec_check_disable
template <class _Res, class... _Args>
__device__ __host__
static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
{
return !__p;
}
template <class _F>
struct __make_cloner
{
#pragma nv_exec_check_disable
__device__ __host__
static void __clone_data(function &__dest, const function &__src)
{
if (__dest.__use_small_functor_data<_F>()) {
__dest.__obj = __dest.__get_small_functor_data();
new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj));
}
else {
__dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj));
}
}
};
template <class _F>
struct __make_destructor
{
#pragma nv_exec_check_disable
__device__ __host__
static void __destruct(function *__fn)
{
if (__fn->__use_small_functor_data<_F>()) {
(__fn->__get_functor<_F>(__fn->__obj)).~_F();
}
else {
delete (_F*)(__fn->__obj);
}
}
};
// We cannot simply define __make_functor in the following way:
// template <class _T, _F>
// __make_functor;
// template <class _RetType1, class _F, class... _ArgTypes1>
// struct __make_functor<_RetType1(_ArgTypes1...), _F>
//
// because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
template <class _RetType1, class _F, class... _ArgTypes1>
struct __make_functor
{
typedef _RetType1 type;
#pragma nv_exec_check_disable
__device__ __host__
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
{
return __get_functor<_F>(__d)(
internal::forward<_ArgTypes1>(__args)...);
}
};
template <class _RetType1, class _C, class _M, class... _ArgTypes1>
struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
{
typedef _RetType1 type;
typedef _RetType1(*_Fn)(_ArgTypes1...);
#pragma nv_exec_check_disable
__device__ __host__
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
{
return __get_functor<_Fn>(__d)(
internal::forward<_ArgTypes1>(__args)...);
}
};
// workaround for GCC version below 4.8
#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
template <class _F>
struct __check_callability
: public std::integral_constant<bool,
!std::is_same<_F, std::nullptr_t>::value>
{ };
#elif defined(_MSC_VER)
// simulate VC 2013's behavior...
template <class _F>
struct __check_callability1
: public
std::integral_constant<bool,
// std::result_of does not handle member pointers well
std::is_member_pointer<_F>::value ||
std::is_convertible<
_RetType,
typename std::result_of<_F(_ArgTypes...)>::type
>::value
>
{ };
template <class _F>
struct __check_callability
: public std::integral_constant<
bool,
!std::is_same<_F, function>::value &&
__check_callability1<typename std::remove_cv<_F>::type>::value>
{ };
#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
template <class _F,
class _T = typename std::result_of<_F(_ArgTypes...)>::type>
struct __check_callability
: public std::integral_constant<
bool,
!std::is_same<_F, function>::value &&
std::is_convertible< _T, _RetType>::value>
{ };
#endif /* __GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
#pragma nv_exec_check_disable
__device__ __host__
void __destroy()
{
if (__obj) {
__destructor(this);
__obj = 0;
}
}
#pragma nv_exec_check_disable
__device__ __host__
void __clear()
{
__obj = 0;
__meta_fn = 0;
__cloner = 0;
__destructor = 0;
}
public:
typedef _RetType result_type;
/*
* These typedef(s) are derived from __maybe_base_function
* typedef T1 argument_type; // only if sizeof...(ArgTypes) == 1 and
* // the type in ArgTypes is T1
* typedef T1 first_argument_type; // only if sizeof...(ArgTypes) == 2 and
* // ArgTypes contains T1 and T2
* typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
* // ArgTypes contains T1 and T2
*/
// 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
#pragma nv_exec_check_disable
__device__ __host__
function() __NV_NOEXCEPT
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
#pragma nv_exec_check_disable
__device__ __host__
function(std::nullptr_t) __NV_NOEXCEPT
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
#pragma nv_exec_check_disable
__device__ __host__
function(const function &__fn)
{
if (__fn.__obj == 0) {
__clear();
}
else {
__meta_fn = __fn.__meta_fn;
__destructor = __fn.__destructor;
__fn.__cloner(*this, __fn);
__cloner = __fn.__cloner;
}
}
#pragma nv_exec_check_disable
__device__ __host__
function(function &&__fn)
{
__fn.swap(*this);
}
// VS 2013 cannot process __check_callability type trait.
// So, we check callability using static_assert instead of
// using SFINAE such as
// template<class _F,
// class = typename std::enable_if<
// __check_callability<_F>::value
// >::type>
#pragma nv_exec_check_disable
template<class _F>
__device__ __host__
function(_F);
// copy and swap
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(const function& __fn)
{
function(__fn).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(function&& __fn)
{
function(internal::move(__fn)).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
function& operator=(std::nullptr_t)
{
__destroy();
return *this;
}
#pragma nv_exec_check_disable
template<class _F>
__device__ __host__
function&
operator=(_F&& __fn)
{
static_assert(__check_callability<_F>::value,
"Unable to create functor object!");
function(internal::forward<_F>(__fn)).swap(*this);
return *this;
}
#pragma nv_exec_check_disable
__device__ __host__
~function()
{
__destroy();
}
// 20.8.11.2.2 function modifiers [func.wrap.func.mod]
#pragma nv_exec_check_disable
__device__ __host__
void swap(function& __fn) __NV_NOEXCEPT
{
internal::swap(__meta_fn, __fn.__meta_fn);
internal::swap(__cloner, __fn.__cloner);
internal::swap(__destructor, __fn.__destructor);
if (__is_small_functor_data() && __fn.__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
}
else if (__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
internal::swap(__obj, __fn.__obj);
__fn.__obj = __fn.__get_small_functor_data();
}
else if (__fn.__is_small_functor_data()) {
internal::swap(__small_functor_data, __fn.__small_functor_data);
internal::swap(__obj, __fn.__obj);
__obj = __get_small_functor_data();
}
else {
internal::swap(__obj, __fn.__obj);
}
}
// 20.8.11.2.3 function capacity [func.wrap.func.cap]
#pragma nv_exec_check_disable
__device__ __host__
explicit operator bool() const __NV_NOEXCEPT
{
return __obj;
}
// 20.8.11.2.4 function invocation [func.wrap.func.inv]
// function::operator() can only be called in device code
// to avoid cross-execution space calls
#pragma nv_exec_check_disable
__device__ __host__
_RetType operator()(_ArgTypes...) const;
};
// Out-of-line definitions
#pragma nv_exec_check_disable
template<class _RetType, class... _ArgTypes>
template<class _F>
__device__ __host__
function<_RetType(_ArgTypes...)>::function(_F __fn)
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
{
static_assert(__check_callability<_F>::value,
"Unable to construct functor object!");
if (__is_empty_functor(__fn))
return;
__meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
__cloner = &__make_cloner<_F>::__clone_data;
__destructor = &__make_destructor<_F>::__destruct;
if (__use_small_functor_data<_F>()) {
__obj = __get_small_functor_data();
new ((void*)__obj) _F(internal::move(__fn));
}
else {
__obj = new _F(internal::move(__fn));
}
}
#pragma nv_exec_check_disable
template <class _RetType, class..._ArgTypes>
__device__ __host__
_RetType
function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
{
return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
}
// 20.8.11.2.6, Null pointer comparisons:
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
__NV_NOEXCEPT
{
return !__fn;
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
__NV_NOEXCEPT
{
return !__fn;
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
__NV_NOEXCEPT
{
return static_cast<bool>(__fn);
}
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
__NV_NOEXCEPT
{
return static_cast<bool>(__fn);
}
// 20.8.11.2.7, specialized algorithms:
#pragma nv_exec_check_disable
template <class _R, class... _ArgTypes>
__device__ __host__
void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
{
__fn1.swap(__fn2);
}
} // namespace nvstd
#undef __NV_NOEXCEPT
#undef __NV_CONSTEXPR
#undef __NV_ALIGNOF
#endif // __NV_LIBCXX_FUNCTIONAL_H__
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
#endif
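As a point of reference, a minimal sketch of how the simplified nvstd::function above is typically used in device code; the kernel name and the wrapped lambda are illustrative assumptions, not part of the deleted header.

#include <nvfunctional>

// A small closure fits the local small-functor buffer, so no device-side heap
// allocation is needed; operator() dispatches through the stored meta-function pointer.
__global__ void apply_twice(const int *in, int *out) {
    nvstd::function<int(int)> f = [](int v) { return v + 1; };
    out[threadIdx.x] = f(f(in[threadIdx.x]));
}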

View file

@@ -1,131 +0,0 @@
/*
* Copyright 2017-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
#endif
#if !defined(__SM_70_RT_H__)
#define __SM_70_RT_H__
#if defined(__CUDACC_RTC__)
#define __SM_70_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_70_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
/******************************************************************************
* match *
******************************************************************************/
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __DEF_IF_HOST
#undef __SM_70_RT_DECL__
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
#include "sm_70_rt.hpp"
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
#endif /* !__SM_70_RT_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
#endif
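A hedged usage sketch of the match intrinsics declared above (compute capability 7.0 or newer); the kernel and buffer names are assumptions for illustration only.

// Each lane obtains a bitmask of the warp lanes that hold the same key value.
__global__ void group_lanes_by_key(const int *keys, unsigned *peer_masks) {
    unsigned lane = threadIdx.x & 31u;
    int key = keys[blockIdx.x * 32 + lane];
    unsigned peers = __match_any_sync(0xffffffffu, key);  // lanes sharing this key
    peer_masks[blockIdx.x * 32 + lane] = peers;
}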

View file

@@ -1,192 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
#endif
#if !defined(__SM_70_RT_HPP__)
#define __SM_70_RT_HPP__
#if defined(__CUDACC_RTC__)
#define __SM_70_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_70_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* Below are implementations of SM-7.0 builtin functions which are included as *
* source (instead of being built in to the compiler) *
* *
*******************************************************************************/
//
// __match_any_sync
//
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
return __match32_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
return __match32_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
return (sizeof(long) == sizeof(long long)) ?
__match64_any_sync(mask, (unsigned long long)value):
__match32_any_sync(mask, (unsigned)value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
return (sizeof(long) == sizeof(long long)) ?
__match64_any_sync(mask, (unsigned long long)value):
__match32_any_sync(mask, (unsigned)value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
return __match64_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
return __match64_any_sync(mask, value);
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
return __match32_any_sync(mask, __float_as_uint(value));
}
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
return __match64_any_sync(mask, __double_as_longlong(value));
}
//
// __match_all_sync
//
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
return __match32_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
return __match32_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
return (sizeof(long) == sizeof(long long)) ?
__match64_all_sync(mask, (unsigned long long)value, pred):
__match32_all_sync(mask, (unsigned)value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
return (sizeof(long) == sizeof(long long)) ?
__match64_all_sync(mask, (unsigned long long)value, pred):
__match32_all_sync(mask, (unsigned)value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
return __match64_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
return __match64_all_sync(mask, value, pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
return __match32_all_sync(mask, __float_as_uint(value), pred);
}
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
return __match64_all_sync(mask, __double_as_longlong(value), pred);
}
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
asm volatile("nanosleep.u32 %0;" :: "r"(ns));
}
extern "C" __device__ __device_builtin__
unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
return __usAtomicCAS(address, compare, val);
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
#endif /* __cplusplus && __CUDACC__ */
#undef __SM_70_RT_DECL__
#endif /* !__SM_70_RT_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
#endif
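To illustrate the 16-bit atomicCAS overload implemented above, a hedged compare-and-swap retry loop that saturates an unsigned short counter at 0xFFFF; the function name is hypothetical, and a properly aligned counter in global or shared memory is assumed.

__device__ void saturating_inc_u16(unsigned short *counter) {
    unsigned short old = *counter;
    while (old != 0xFFFFu) {                       // stop once saturated
        unsigned short assumed = old;
        old = atomicCAS(counter, assumed, (unsigned short)(assumed + 1u));
        if (old == assumed)                        // our CAS won the race
            break;
    }
}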

View file

@@ -1,158 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
#endif
#if !defined(__SM_80_RT_H__)
#define __SM_80_RT_H__
#if defined(__CUDACC_RTC__)
#define __SM_80_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_80_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
#ifndef __CUDA_ARCH__
#define __DEF_IF_HOST { }
#else /* !__CUDA_ARCH__ */
#define __DEF_IF_HOST ;
#endif /* __CUDA_ARCH__ */
/******************************************************************************
* reduce *
******************************************************************************/
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
extern "C" {
inline __device__ void *__nv_associate_access_property(const void *ptr,
unsigned long long property) {
extern __device__ void *__nv_associate_access_property_impl(const void *,
unsigned long long);
return __nv_associate_access_property_impl(ptr, property);
}
inline __device__ void __nv_memcpy_async_shared_global_4(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_4_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
}
inline __device__ void __nv_memcpy_async_shared_global_8(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_8_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
}
inline __device__ void __nv_memcpy_async_shared_global_16(void *dst,
const void *src,
unsigned src_size) {
extern __device__ void __nv_memcpy_async_shared_global_16_impl(void *,
const void *,
unsigned);
__nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
}
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
#endif /* __cplusplus && __CUDACC__ */
#undef __DEF_IF_HOST
#undef __SM_80_RT_DECL__
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
#include "sm_80_rt.hpp"
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
#endif /* !__SM_80_RT_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
#endif

View file

@@ -1,148 +0,0 @@
/*
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
#endif
#if !defined(__SM_80_RT_HPP__)
#define __SM_80_RT_HPP__
#if defined(__CUDACC_RTC__)
#define __SM_80_RT_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __SM_80_RT_DECL__ static __device__ __inline__
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* Below are implementations of SM-8.0 builtin functions which are included as *
* source (instead of being built in to the compiler) *
* *
*******************************************************************************/
extern "C" {
__device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
__device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
__device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
}
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) {
return __reduce_add_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) {
return __reduce_min_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) {
return __reduce_max_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) {
return __reduce_add_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) {
return __reduce_min_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) {
return __reduce_max_sync_signed_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) {
return __reduce_and_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) {
return __reduce_or_sync_unsigned_impl(mask, value);
}
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) {
return __reduce_xor_sync_unsigned_impl(mask, value);
}
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
#endif /* __cplusplus && __CUDACC__ */
#undef __SM_80_RT_DECL__
#endif /* !__SM_80_RT_HPP__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
#endif
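A minimal sketch of the warp-reduce intrinsics implemented above, summing one value per lane across a full warp on compute capability 8.0 or newer; kernel and buffer names are illustrative assumptions.

__global__ void warp_sum(const int *in, int *out) {
    unsigned lane = threadIdx.x & 31u;
    int v = in[blockIdx.x * 32 + lane];
    int total = __reduce_add_sync(0xffffffffu, v);  // every lane receives the warp-wide sum
    if (lane == 0)
        out[blockIdx.x] = total;
}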

View file

@@ -1,142 +0,0 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
#endif
#if !defined(__STORAGE_CLASS_H__)
#define __STORAGE_CLASS_H__
#if !defined(__var_used__)
#define __var_used__
#endif /* __var_used__ */
#if !defined(__loc_sc__)
#define __loc_sc__(loc, size, sc) \
__storage##_##sc##size##loc loc
#endif /* !__loc_sc__ */
#if !defined(__storage___device__)
#define __storage___device__ static __var_used__
#endif /* __storage___device__ */
#if !defined(__storage_extern__device__)
#define __storage_extern__device__ static __var_used__
#endif /* __storage_extern__device__ */
#if !defined(__storage_auto__device__)
#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__device__ */
#if !defined(__storage_static__device__)
#define __storage_static__device__ static __var_used__
#endif /* __storage_static__device__ */
#if !defined(__storage___constant__)
#define __storage___constant__ static __var_used__
#endif /* __storage___constant__ */
#if !defined(__storage_extern__constant__)
#define __storage_extern__constant__ static __var_used__
#endif /* __storage_extern__constant__ */
#if !defined(__storage_auto__constant__)
#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__constant__ */
#if !defined(__storage_static__constant__)
#define __storage_static__constant__ static __var_used__
#endif /* __storage_static__constant__ */
#if !defined(__storage___shared__)
#define __storage___shared__ static __var_used__
#endif /* __storage___shared__ */
#if !defined(__storage_extern__shared__)
#define __storage_extern__shared__ static __var_used__
#endif /* __storage_extern__shared__ */
#if !defined(__storage_auto__shared__)
#define __storage_auto__shared__ static
#endif /* __storage_auto__shared__ */
#if !defined(__storage_static__shared__)
#define __storage_static__shared__ static __var_used__
#endif /* __storage_static__shared__ */
#if !defined(__storage__unsized__shared__)
#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage__unsized__shared__ */
#if !defined(__storage_extern_unsized__shared__)
#define __storage_extern_unsized__shared__ static __var_used__
#endif /* __storage_extern_unsized__shared__ */
#if !defined(__storage_auto_unsized__shared__)
#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto_unsized__shared__ */
#if !defined(__storage_static_unsized__shared__)
#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_static_unsized__shared__ */
#if !defined(__storage___text__)
#define __storage___text__ static __var_used__
#endif /* __storage___text__ */
#if !defined(__storage_extern__text__)
#define __storage_extern__text__ static __var_used__
#endif /* __storage_extern__text__ */
#if !defined(__storage_auto__text__)
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__text__ */
#if !defined(__storage_static__text__)
#define __storage_static__text__ static __var_used__
#endif /* __storage_static__text__ */
#if !defined(__storage___surf__)
#define __storage___surf__ static __var_used__
#endif /* __storage___surf__ */
#if !defined(__storage_extern__surf__)
#define __storage_extern__surf__ static __var_used__
#endif /* __storage_extern__surf__ */
#if !defined(__storage_auto__surf__)
#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__surf__ */
#if !defined(__storage_static__surf__)
#define __storage_static__surf__ static __var_used__
#endif /* __storage_static__surf__ */
#endif /* !__STORAGE_CLASS_H__ */
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
#endif
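
Side note on the deleted storage_class.h: the __loc_sc__ macro simply token-pastes a storage class, an optional size tag, and a location qualifier into one of the __storage_* macros defined above. A hypothetical expansion, shown only to illustrate the pasting (the invocation itself is not taken from this diff):

/* Hypothetical invocation, traced through the macros above:
 *
 *   __loc_sc__(__device__, , static)
 *     -> __storage_static__device__ __device__   (after ## token pasting)
 *     -> static __var_used__ __device__          (after expanding __storage_static__device__)
 *
 * The "@@@ COMPILER @@@ ERROR @@@" expansions are deliberate: an unsupported
 * combination (e.g. an auto __device__ variable) expands to tokens that cannot
 * compile, turning the invalid combination into a build-time error.
 */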

View file

@ -1,348 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
#if !defined(__CUDACC_RTC__)
#if defined(__GNUC__)
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
#endif
#endif
/* When including a C header file in C++ code, extern "C" is normally required.
* However, the standard QNX headers already contain an extern "C" guard when
* compiled as C++, and extern "C" blocks cannot be nested, so this header
* include is kept outside of the extern "C" block.
*/
#if !defined(__CUDACC__)
#include <math.h> /* import fabsf, sqrt */
#endif /* !defined(__CUDACC__) */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now we chose the
* simplest and fastest solution; however, this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now we chose the simplest
* and fastest solution; however, this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */
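
For reference, the helpers in the deleted cuComplex.h are plain __host__ __device__ inline functions, so they can be exercised from host code as well. A minimal sketch, assuming it is compiled with nvcc (the file contents and values are made up for illustration):

#include <cuComplex.h>
#include <cstdio>

int main()
{
    cuFloatComplex a = make_cuFloatComplex(3.0f, 4.0f);   // 3 + 4i
    cuFloatComplex b = make_cuFloatComplex(1.0f, -2.0f);  // 1 - 2i

    cuFloatComplex p = cuCmulf(a, b);   // (3+4i)(1-2i) = 11 - 2i
    cuFloatComplex q = cuCdivf(a, b);   // scaled division from above, guards over/underflow
    float          m = cuCabsf(a);      // |3+4i| = 5, computed without relying on hypotf

    printf("p = %g%+gi  q = %g%+gi  |a| = %g\n",
           cuCrealf(p), cuCimagf(p), cuCrealf(q), cuCimagf(q), m);
    return 0;
}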

View file

@ -1,887 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit(void);
cublasStatus CUBLASWINAPI cublasShutdown(void);
cublasStatus CUBLASWINAPI cublasGetError(void);
cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI
cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
void CUBLASWINAPI
cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv(char trans,
int m,
int n,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDgemv(char trans,
int m,
int n,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasCgemv(char trans,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZgemv(char trans,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv(char trans,
int m,
int n,
int kl,
int ku,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDgbmv(char trans,
int m,
int n,
int kl,
int ku,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasCgbmv(char trans,
int m,
int n,
int kl,
int ku,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZgbmv(char trans,
int m,
int n,
int kl,
int ku,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI
cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI
cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI cublasZtbmv(
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
void CUBLASWINAPI
cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI
cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
void CUBLASWINAPI
cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
void CUBLASWINAPI
cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
void CUBLASWINAPI cublasZtbsv(
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv(
char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
void CUBLASWINAPI cublasDsymv(char uplo,
int n,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasChemv(char uplo,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhemv(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv(char uplo,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* x,
int incx,
float beta,
float* y,
int incy);
void CUBLASWINAPI cublasDsbmv(char uplo,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* x,
int incx,
double beta,
double* y,
int incy);
void CUBLASWINAPI cublasChbmv(char uplo,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhbmv(char uplo,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI
cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
void CUBLASWINAPI cublasDspmv(
char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo,
int n,
cuComplex alpha,
const cuComplex* AP,
const cuComplex* x,
int incx,
cuComplex beta,
cuComplex* y,
int incy);
void CUBLASWINAPI cublasZhpmv(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* AP,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex beta,
cuDoubleComplex* y,
int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI
cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
void CUBLASWINAPI
cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
void CUBLASWINAPI cublasCgeru(
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
void CUBLASWINAPI cublasCgerc(
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
void CUBLASWINAPI cublasZgeru(int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
void CUBLASWINAPI cublasZgerc(int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
void CUBLASWINAPI
cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI
cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
void CUBLASWINAPI
cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
void CUBLASWINAPI cublasCher2(char uplo,
int n,
cuComplex alpha,
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
void CUBLASWINAPI cublasZher2(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI
cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
void CUBLASWINAPI
cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
void CUBLASWINAPI cublasChpr2(
char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
void CUBLASWINAPI cublasZhpr2(char uplo,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm(char transa,
char transb,
int m,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDgemm(char transa,
char transb,
int m,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCgemm(char transa,
char transb,
int m,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZgemm(char transa,
char transb,
int m,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI
cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
void CUBLASWINAPI cublasDsyrk(
char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
void CUBLASWINAPI cublasCsyrk(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsyrk(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk(
char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
void CUBLASWINAPI cublasZherk(char uplo,
char trans,
int n,
int k,
double alpha,
const cuDoubleComplex* A,
int lda,
double beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k(char uplo,
char trans,
int n,
int k,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDsyr2k(char uplo,
char trans,
int n,
int k,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCsyr2k(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsyr2k(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k(char uplo,
char trans,
int n,
int k,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
float beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZher2k(char uplo,
char trans,
int n,
int k,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
double beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm(char side,
char uplo,
int m,
int n,
float alpha,
const float* A,
int lda,
const float* B,
int ldb,
float beta,
float* C,
int ldc);
void CUBLASWINAPI cublasDsymm(char side,
char uplo,
int m,
int n,
double alpha,
const double* A,
int lda,
const double* B,
int ldb,
double beta,
double* C,
int ldc);
void CUBLASWINAPI cublasCsymm(char side,
char uplo,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZsymm(char side,
char uplo,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm(char side,
char uplo,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex beta,
cuComplex* C,
int ldc);
void CUBLASWINAPI cublasZhemm(char side,
char uplo,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex beta,
cuDoubleComplex* C,
int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
float alpha,
const float* A,
int lda,
float* B,
int ldb);
void CUBLASWINAPI cublasDtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
double alpha,
const double* A,
int lda,
double* B,
int ldb);
void CUBLASWINAPI cublasCtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex* B,
int ldb);
void CUBLASWINAPI cublasZtrsm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* B,
int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
float alpha,
const float* A,
int lda,
float* B,
int ldb);
void CUBLASWINAPI cublasDtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
double alpha,
const double* A,
int lda,
double* B,
int ldb);
void CUBLASWINAPI cublasCtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuComplex alpha,
const cuComplex* A,
int lda,
cuComplex* B,
int ldb);
void CUBLASWINAPI cublasZtrmm(char side,
char uplo,
char transa,
char diag,
int m,
int n,
cuDoubleComplex alpha,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */
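
For context, the file above is the legacy cuBLAS API (thread-global state, no handle argument); newer code is expected to go through cublas_v2.h instead. A minimal sketch of the legacy flow, using only functions declared above plus cudaMemcpy, with error handling trimmed (matrix contents are made up):

/* Minimal sketch of the legacy cuBLAS API, column-major 2x2 matrices. */
#include <cublas.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    const int n = 2;
    float hA[4] = {1, 2, 3, 4};        /* column-major 2x2 */
    float hB[4] = {5, 6, 7, 8};
    float hC[4] = {0};

    cublasInit();                       /* legacy global initialization */

    float *dA, *dB, *dC;
    cublasAlloc(n * n, sizeof(float), (void**)&dA);
    cublasAlloc(n * n, sizeof(float), (void**)&dB);
    cublasAlloc(n * n, sizeof(float), (void**)&dC);

    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    /* C = 1.0 * A * B + 0.0 * C, no transposes */
    cublasSgemm('N', 'N', n, n, n, 1.0f, dA, n, dB, n, 0.0f, dC, n);
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "sgemm failed\n");

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("C[0]=%f C[3]=%f\n", hC[0], hC[3]);

    cublasFree(dA); cublasFree(dB); cublasFree(dC);
    cublasShutdown();
    return 0;
}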

File diff suppressed because it is too large

View file

@ -1,693 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
*/
#if !defined(CUBLAS_XT_H_)
#define CUBLAS_XT_H_
#include "driver_types.h"
#include "cuComplex.h" /* import complex data type */
#include "cublas_v2.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
struct cublasXtContext;
typedef struct cublasXtContext* cublasXtHandle_t;
cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
/* This routine selects the GPUs that the user wants to use for CUBLAS-XT */
cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
/* This routine allows the user to change the dimension of the tiles (blockDim x blockDim) */
cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
/* These routines allow CUBLAS-XT to pin the host memory if it finds that some of the matrices passed
are not pinned. Pinning/unpinning the host memory is still a costly operation;
it is better if the user controls the memory themselves (by pinning/unpinning only when necessary)
*/
cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
/* This routine is used to provide a CPU BLAS routine, used for sizes that are too small or for hybrid computation */
typedef enum {
CUBLASXT_FLOAT = 0,
CUBLASXT_DOUBLE = 1,
CUBLASXT_COMPLEX = 2,
CUBLASXT_DOUBLECOMPLEX = 3,
} cublasXtOpType_t;
typedef enum {
CUBLASXT_GEMM = 0,
CUBLASXT_SYRK = 1,
CUBLASXT_HERK = 2,
CUBLASXT_SYMM = 3,
CUBLASXT_HEMM = 4,
CUBLASXT_TRSM = 5,
CUBLASXT_SYR2K = 6,
CUBLASXT_HER2K = 7,
CUBLASXT_SPMM = 8,
CUBLASXT_SYRKX = 9,
CUBLASXT_HERKX = 10,
CUBLASXT_TRMM = 11,
CUBLASXT_ROUTINE_MAX = 12,
} cublasXtBlasOp_t;
/* Currently only 32-bit integer BLAS routines are supported */
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
cublasXtBlasOp_t blasOp,
cublasXtOpType_t type,
void* blasFunctor);
/* Specifies the percentage of work that should be done by the CPU; default is 0 (no work) */
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
cublasXtBlasOp_t blasOp,
cublasXtOpType_t type,
float ratio);
/* GEMM */
cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
size_t m,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* ------------------------------------------------------- */
/* SYRK */
cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HERK */
cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const cuComplex* A,
size_t lda,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const cuDoubleComplex* A,
size_t lda,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SYR2K */
cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HERKX : variant extension of HERK */
cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* TRSM */
cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
float* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
double* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
cuComplex* B,
size_t ldb);
cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
cuDoubleComplex* B,
size_t ldb);
/* -------------------------------------------------------------------- */
/* SYMM : Symmetric Multiply Matrix*/
cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HEMM : Hermitian Matrix Multiply */
cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SYRKX : variant extension of SYRK */
cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* HER2K : variant extension of HERK */
cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
const float* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
size_t n,
size_t k,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
const double* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* SPMM : Symmetric Packed Multiply Matrix*/
cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const float* alpha,
const float* AP,
const float* B,
size_t ldb,
const float* beta,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const double* alpha,
const double* AP,
const double* B,
size_t ldb,
const double* beta,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* AP,
const cuComplex* B,
size_t ldb,
const cuComplex* beta,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* AP,
const cuDoubleComplex* B,
size_t ldb,
const cuDoubleComplex* beta,
cuDoubleComplex* C,
size_t ldc);
/* -------------------------------------------------------------------- */
/* TRMM */
cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const float* alpha,
const float* A,
size_t lda,
const float* B,
size_t ldb,
float* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const double* alpha,
const double* A,
size_t lda,
const double* B,
size_t ldb,
double* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuComplex* alpha,
const cuComplex* A,
size_t lda,
const cuComplex* B,
size_t ldb,
cuComplex* C,
size_t ldc);
cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
size_t m,
size_t n,
const cuDoubleComplex* alpha,
const cuDoubleComplex* A,
size_t lda,
const cuDoubleComplex* B,
size_t ldb,
cuDoubleComplex* C,
size_t ldc);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_XT_H_) */
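
The cublasXt entry points above take host pointers and tile the matrices across the selected GPUs internally, so no explicit device allocation or copies are needed. A minimal sketch using only the declarations above, assuming a single GPU with device id 0 (the sizes and block dimension are arbitrary):

/* Minimal sketch of the cublasXt host-pointer API. The library tiles the host
 * matrices and streams them to the selected GPU(s). */
#include <cublasXt.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const size_t n = 512;
    float *A = (float*)calloc(n * n, sizeof(float));
    float *B = (float*)calloc(n * n, sizeof(float));
    float *C = (float*)calloc(n * n, sizeof(float));
    const float alpha = 1.0f, beta = 0.0f;

    cublasXtHandle_t handle;
    int devices[1] = {0};

    if (cublasXtCreate(&handle) != CUBLAS_STATUS_SUCCESS) return 1;
    cublasXtDeviceSelect(handle, 1, devices);   /* use GPU 0 only */
    cublasXtSetBlockDim(handle, 1024);          /* optional tile size */

    /* C = alpha * A * B + beta * C, directly on host pointers */
    cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                  n, n, n, &alpha, A, n, B, n, &beta, C, n);

    cublasXtDestroy(handle);
    free(A); free(B); free(C);
    printf("done\n");
    return 0;
}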

File diff suppressed because it is too large

View file

@ -1,273 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
 * This is the public header file for the new CUBLAS library API; it maps the generic
 * cuBLAS function names to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetWorkspace cublasSetWorkspace_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas1 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */
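The defines above are pure name mapping: once cublas_v2.h is included, a call written with the generic name compiles against the handle-based _v2 entry point declared in cublas_api.h. A hedged illustration, assuming d_x is a device pointer allocated elsewhere:

#include <cublas_v2.h>

/* Scale a device vector in place; every generic name below is rewritten by the
 * macros above (cublasCreate -> cublasCreate_v2, cublasSscal -> cublasSscal_v2, ...). */
int scale_on_device(float *d_x, int n, float alpha) {
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
        return -1;
    cublasStatus_t st = cublasSscal(handle, n, &alpha, d_x, 1);
    cublasDestroy(handle);
    return st == CUBLAS_STATUS_SUCCESS ? 0 : -1;
}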

File diff suppressed because it is too large

View file

@@ -1,805 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D10_H
#define CUDAD3D10_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D10CtxCreate cuD3D10CtxCreate_v2
#define cuD3D10ResourceGetSurfaceDimensions cuD3D10ResourceGetSurfaceDimensions_v2
#define cuD3D10ResourceGetMappedPointer cuD3D10ResourceGetMappedPointer_v2
#define cuD3D10ResourceGetMappedSize cuD3D10ResourceGetMappedSize_v2
#define cuD3D10ResourceGetMappedPitch cuD3D10ResourceGetMappedPitch_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \defgroup CUDA_D3D10 Direct3D 10 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 10 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 10 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 10 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D10 device
*/
typedef enum CUd3d10DeviceList_enum {
CU_D3D10_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D10 device */
CU_D3D10_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
CU_D3D10_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */
} CUd3d10DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
*
* If no device on \p pAdapter is CUDA-compatible then the call will fail.
*
* \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
* \param pAdapter - Adapter to query for CUDA device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevices,
* ::cudaD3D10GetDevice
*/
CUresult CUDAAPI cuD3D10GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 10 device
*
 * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 10 device \p pD3D10Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 10 device \p pD3D10Device.
*
* If any of the GPUs being used to render \p pDevice are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D10Device - Direct3D 10 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D10_DEVICE_LIST_ALL for all devices,
* ::CU_D3D10_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D10_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice,
* ::cudaD3D10GetDevices
*/
CUresult CUDAAPI cuD3D10GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
/**
* \brief Register a Direct3D 10 resource for access by CUDA
*
* Registers the Direct3D 10 resource \p pD3DResource for access by CUDA and
 * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::ID3D10Buffer: may be accessed through a device pointer.
* - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
 * If \p Flags is not one of the above specified values then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D10RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D10RegisterResource(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
/**
* \defgroup CUDA_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 10 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 10 interoperability functionality.
* @{
*/
/** Flags to register a resource */
typedef enum CUD3D10register_flags_enum {
CU_D3D10_REGISTER_FLAGS_NONE = 0x00,
CU_D3D10_REGISTER_FLAGS_ARRAY = 0x01,
} CUD3D10register_flags;
/** Flags to map or unmap a resource */
typedef enum CUD3D10map_flags_enum {
CU_D3D10_MAPRESOURCE_FLAGS_NONE = 0x00,
CU_D3D10_MAPRESOURCE_FLAGS_READONLY = 0x01,
CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
} CUD3D10map_flags;
/**
* \brief Create a CUDA context for interoperability with Direct3D 10
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice,
* ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 10
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D10_DEVICES_ALL from ::cuD3D10GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D10GetDevices,
* ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 10 device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D10GetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10GetDirect3DDevice(ID3D10Device **ppD3DDevice);
/**
* \brief Register a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cuD3D10UnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cuD3D10UnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::ID3D10Buffer: Cannot be used with \p Flags set to
* ::CU_D3D10_REGISTER_FLAGS_ARRAY.
* - ::ID3D10Texture1D: No restrictions.
* - ::ID3D10Texture2D: No restrictions.
* - ::ID3D10Texture3D: No restrictions.
*
* The \p Flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following values are allowed.
*
* - ::CU_D3D10_REGISTER_FLAGS_NONE: Specifies that CUDA will access this
* resource through a ::CUdeviceptr. The pointer, size, and (for textures),
* pitch for each subresource of this allocation may be queried through
* ::cuD3D10ResourceGetMappedPointer(), ::cuD3D10ResourceGetMappedSize(),
* and ::cuD3D10ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
* - ::CU_D3D10_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
* resource through a ::CUarray queried on a sub-resource basis through
* ::cuD3D10ResourceGetMappedArray(). This option is only valid for
* resources of type ::ID3D10Texture1D, ::ID3D10Texture2D, and
* ::ID3D10Texture3D.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context then
* ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
* type or is already registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource cannot be registered, then ::CUDA_ERROR_UNKNOWN
* is returned.
*
* \param pResource - Resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsD3D10RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10RegisterResource(ID3D10Resource *pResource, unsigned int Flags);
/**
* \brief Unregister a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned.
*
* \param pResource - Resources to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnregisterResource(ID3D10Resource *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are mapped
* by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D calls
* issued before ::cuD3D10MapResources() will complete before any CUDA kernels
* issued after ::cuD3D10MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are
* presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
* returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10MapResources(unsigned int count, ID3D10Resource **ppResources);
/**
* \brief Unmap Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cuD3D10UnmapResources() will complete before any Direct3D
* calls issued after ::cuD3D10UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are not
* presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnmapResources(unsigned int count, ID3D10Resource **ppResources);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped. The
* \p Flags argument may be any of the following.
*
* - ::CU_D3D10_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_D3D10_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
* mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* \param pResource - Registered resource to set flags for
* \param Flags - Parameters for resource mapping
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int Flags);
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p SubResource may be
* accessed. The value set in \p pArray may change every time that \p pResource
* is mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D10_REGISTER_FLAGS_ARRAY, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedArray(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pDevPtr the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p SubResource. The
* value set in \p pDevPtr may change every time that \p pResource is mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D10_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* If \p pResource is of type ::ID3D10Buffer, then \p SubResource must be 0.
* If \p pResource is of any other type, then the value of \p SubResource must
* come from the subresource calculation in ::D3D10CalcSubResource().
*
* \param pDevPtr - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p SubResource. The value set
* in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p SubResource. The values set in \p pPitch and \p pPitchSlice may
* change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
* \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture10 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
/**
* \brief Get the dimensions of a registered surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p SubResource.
*
* Because anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture10 or
* ::IDirect3DSurface10 or if \p pResource has not been registered for use
* with CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
*
* For usage requirements of the \p SubResource parameter, see
* ::cuD3D10ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param SubResource - Subresource of pResource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
/** @} */ /* END CUDA_D3D10_DEPRECATED */
/** @} */ /* END CUDA_D3D10 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D10CtxCreate
#undef cuD3D10ResourceGetSurfaceDimensions
#undef cuD3D10ResourceGetMappedPointer
#undef cuD3D10ResourceGetMappedSize
#undef cuD3D10ResourceGetMappedPitch
CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetMappedSize(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif
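For reference, a hedged sketch of the non-deprecated path documented above: register an existing ID3D10Buffer, map it, read back the device pointer, then unmap and unregister. It assumes a current CUDA context on the device reported by cuD3D10GetDevice and a valid buffer created by the application; error handling is trimmed:

#include <d3d10.h>
#include <cuda.h>
#include <cudaD3D10.h>

CUresult use_d3d10_buffer(ID3D10Buffer *pBuffer) {
    CUgraphicsResource res;
    CUresult rc = cuGraphicsD3D10RegisterResource(&res, (ID3D10Resource *)pBuffer,
                                                  CU_GRAPHICS_REGISTER_FLAGS_NONE);
    if (rc != CUDA_SUCCESS) return rc;

    rc = cuGraphicsMapResources(1, &res, 0);              /* 0 = default stream */
    if (rc == CUDA_SUCCESS) {
        CUdeviceptr dptr;
        size_t bytes;
        cuGraphicsResourceGetMappedPointer(&dptr, &bytes, res);
        /* ... launch kernels that read or write dptr here, while mapped ... */
        cuGraphicsUnmapResources(1, &res, 0);
    }
    cuGraphicsUnregisterResource(res);                    /* drops the D3D refcount */
    return rc;
}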

View file

@@ -1,119 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D10TYPEDEFS_H
#define CUDAD3D10TYPEDEFS_H
// Dependent includes for cudaD3D10.h
#include <rpcsal.h>
#include <D3D10_1.h>
#include <cudaD3D10.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D10.h
*/
#define PFN_cuD3D10GetDevice PFN_cuD3D10GetDevice_v2010
#define PFN_cuD3D10GetDevices PFN_cuD3D10GetDevices_v3020
#define PFN_cuGraphicsD3D10RegisterResource PFN_cuGraphicsD3D10RegisterResource_v3000
#define PFN_cuD3D10CtxCreate PFN_cuD3D10CtxCreate_v3020
#define PFN_cuD3D10CtxCreateOnDevice PFN_cuD3D10CtxCreateOnDevice_v3020
#define PFN_cuD3D10GetDirect3DDevice PFN_cuD3D10GetDirect3DDevice_v3020
#define PFN_cuD3D10RegisterResource PFN_cuD3D10RegisterResource_v2010
#define PFN_cuD3D10UnregisterResource PFN_cuD3D10UnregisterResource_v2010
#define PFN_cuD3D10MapResources PFN_cuD3D10MapResources_v2010
#define PFN_cuD3D10UnmapResources PFN_cuD3D10UnmapResources_v2010
#define PFN_cuD3D10ResourceSetMapFlags PFN_cuD3D10ResourceSetMapFlags_v2010
#define PFN_cuD3D10ResourceGetMappedArray PFN_cuD3D10ResourceGetMappedArray_v2010
#define PFN_cuD3D10ResourceGetMappedPointer PFN_cuD3D10ResourceGetMappedPointer_v3020
#define PFN_cuD3D10ResourceGetMappedSize PFN_cuD3D10ResourceGetMappedSize_v3020
#define PFN_cuD3D10ResourceGetMappedPitch PFN_cuD3D10ResourceGetMappedPitch_v3020
#define PFN_cuD3D10ResourceGetSurfaceDimensions PFN_cuD3D10ResourceGetSurfaceDimensions_v3020
/**
* Type definitions for functions defined in cudaD3D10.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevice_v2010)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D10RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10GetDirect3DDevice_v3020)(ID3D10Device **ppD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10RegisterResource_v2010)(ID3D10Resource *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10UnregisterResource_v2010)(ID3D10Resource *pResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10MapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
typedef CUresult (CUDAAPI *PFN_cuD3D10UnmapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceSetMapFlags_v2010)(ID3D10Resource *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedArray_v2010)(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v3020)(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
/*
* Type definitions for older versioned functions in cudaD3D10.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v2010)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v2010)(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v2010)(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v2010)(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v2010)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard
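The PFN_* typedefs above exist so the driver entry points can be resolved at run time and called through a correctly typed pointer. A hedged sketch using plain Win32 dynamic loading; the nvcuda.dll name, the unversioned export name, and the loading strategy are assumptions about intended usage, not something this repository prescribes:

#include <windows.h>
#include <cuda.h>
#include <cudaD3D10Typedefs.h>

/* Resolve cuD3D10GetDevice from the driver library and call it through the
 * v2010 function-pointer type defined above. */
static CUresult get_device_for_adapter(IDXGIAdapter *pAdapter, CUdevice *dev) {
    HMODULE drv = LoadLibraryA("nvcuda.dll");     /* CUDA driver library on Windows */
    if (!drv) return CUDA_ERROR_NOT_INITIALIZED;
    PFN_cuD3D10GetDevice_v2010 pfnGetDevice =
        (PFN_cuD3D10GetDevice_v2010)GetProcAddress(drv, "cuD3D10GetDevice");
    if (!pfnGetDevice) return CUDA_ERROR_NOT_FOUND;
    return pfnGetDevice(dev, pAdapter);           /* library stays loaded on purpose */
}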

View file

@@ -1,357 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D11_H
#define CUDAD3D11_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D11CtxCreate cuD3D11CtxCreate_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \defgroup CUDA_D3D11 Direct3D 11 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 11 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 11 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 11 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D11 device
*/
typedef enum CUd3d11DeviceList_enum {
CU_D3D11_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D11 device */
CU_D3D11_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
CU_D3D11_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */
} CUd3d11DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
*
* If no device on \p pAdapter is CUDA-compatible the call will return
* ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
* \param pAdapter - Adapter to query for CUDA device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevices,
* ::cudaD3D11GetDevice
*/
CUresult CUDAAPI cuD3D11GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 11 device
*
 * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 11 device \p pD3D11Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 11 device \p pD3D11Device.
*
* If any of the GPUs being used to render \p pDevice are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D11Device - Direct3D 11 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D11_DEVICE_LIST_ALL for all devices,
* ::CU_D3D11_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D11_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice,
* ::cudaD3D11GetDevices
*/
CUresult CUDAAPI cuD3D11GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
/**
* \brief Register a Direct3D 11 resource for access by CUDA
*
* Registers the Direct3D 11 resource \p pD3DResource for access by CUDA and
 * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::ID3D11Buffer: may be accessed through a device pointer.
* - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
 * If \p Flags is not one of the above specified values then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D11RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D11RegisterResource(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
/**
* \defgroup CUDA_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 11 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 11 interoperability functionality.
* @{
*/
/**
* \brief Create a CUDA context for interoperability with Direct3D 11
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice,
* ::cuGraphicsD3D11RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 11
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D11_DEVICES_ALL from ::cuD3D11GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D11GetDevices,
* ::cuGraphicsD3D11RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 11 device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D11GetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11GetDirect3DDevice(ID3D11Device **ppD3DDevice);
/** @} */ /* END CUDA_D3D11_DEPRECATED */
/** @} */ /* END CUDA_D3D11 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D11CtxCreate
CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,92 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D11TYPEDEFS_H
#define CUDAD3D11TYPEDEFS_H
// Dependent includes for cudaD3D11.h
#include <rpcsal.h>
#include <D3D11_1.h>
#include <cudaD3D11.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D11.h
*/
#define PFN_cuD3D11GetDevice PFN_cuD3D11GetDevice_v3000
#define PFN_cuD3D11GetDevices PFN_cuD3D11GetDevices_v3020
#define PFN_cuGraphicsD3D11RegisterResource PFN_cuGraphicsD3D11RegisterResource_v3000
#define PFN_cuD3D11CtxCreate PFN_cuD3D11CtxCreate_v3020
#define PFN_cuD3D11CtxCreateOnDevice PFN_cuD3D11CtxCreateOnDevice_v3020
#define PFN_cuD3D11GetDirect3DDevice PFN_cuD3D11GetDirect3DDevice_v3020
/**
* Type definitions for functions defined in cudaD3D11.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevice_v3000)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D11RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D11GetDirect3DDevice_v3020)(ID3D11Device **ppD3DDevice);
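/*
* Example (not part of the original header): a sketch of the dynamic-loading
* pattern these PFN_* typedefs are meant to support on Windows. It assumes
* <windows.h> is available and that the display driver exports this function
* under its plain name (the _v3000 typedef matches that export); the caller
* must check for NULL.
*/
#include <windows.h>
static PFN_cuGraphicsD3D11RegisterResource exampleLoadRegisterResource(void)
{
    HMODULE driver = LoadLibraryA("nvcuda.dll");
    if (driver == NULL)
        return NULL;
    return (PFN_cuGraphicsD3D11RegisterResource)
        GetProcAddress(driver, "cuGraphicsD3D11RegisterResource");
}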
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,886 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D9_H
#define CUDAD3D9_H
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#define cuD3D9CtxCreate cuD3D9CtxCreate_v2
#define cuD3D9ResourceGetSurfaceDimensions cuD3D9ResourceGetSurfaceDimensions_v2
#define cuD3D9ResourceGetMappedPointer cuD3D9ResourceGetMappedPointer_v2
#define cuD3D9ResourceGetMappedSize cuD3D9ResourceGetMappedSize_v2
#define cuD3D9ResourceGetMappedPitch cuD3D9ResourceGetMappedPitch_v2
#define cuD3D9MapVertexBuffer cuD3D9MapVertexBuffer_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file cudaD3D9.h
* \brief Header file for the Direct3D 9 interoperability functions of the
* low-level CUDA driver application programming interface.
*/
/**
* \defgroup CUDA_D3D9 Direct3D 9 Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ Direct3D 9 interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the Direct3D 9 interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of Direct3D 9 resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D9 device
*/
typedef enum CUd3d9DeviceList_enum {
CU_D3D9_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D9 device */
CU_D3D9_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
CU_D3D9_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */
} CUd3d9DeviceList;
/**
* \brief Gets the CUDA device corresponding to a display adapter.
*
* Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
* adapter name \p pszAdapterName obtained from ::EnumDisplayDevices() or
* ::IDirect3D9::GetAdapterIdentifier().
*
* If no device on the adapter with name \p pszAdapterName is CUDA-compatible,
* then the call will fail.
*
* \param pCudaDevice - Returned CUDA device corresponding to pszAdapterName
* \param pszAdapterName - Adapter name to query for device
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cudaD3D9GetDevice
*/
CUresult CUDAAPI cuD3D9GetDevice(CUdevice *pCudaDevice, const char *pszAdapterName);
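/*
* Example (not part of the original header): a hedged sketch of feeding an
* adapter name from ::EnumDisplayDevices() into ::cuD3D9GetDevice. It assumes
* <windows.h> is included and simply queries adapter 0 for illustration;
* mapping a Win32 failure to ::CUDA_ERROR_UNKNOWN is a choice made for this
* sketch, not something the API prescribes.
*/
static CUresult exampleGetCudaDeviceForAdapter0(CUdevice *pCudaDevice)
{
    DISPLAY_DEVICEA dd;
    ZeroMemory(&dd, sizeof(dd));
    dd.cb = sizeof(dd);
    if (!EnumDisplayDevicesA(NULL, 0, &dd, 0))
        return CUDA_ERROR_UNKNOWN;
    return cuD3D9GetDevice(pCudaDevice, dd.DeviceName);
}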
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 9 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 9 device \p pD3D9Device.
* Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 9 device \p pD3D9Device.
*
* If any of the GPUs being used to render \p pD3D9Device are not CUDA capable then the
* call will return ::CUDA_ERROR_NO_DEVICE.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D9Device - Direct3D 9 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::CU_D3D9_DEVICE_LIST_ALL for all devices,
* ::CU_D3D9_DEVICE_LIST_CURRENT_FRAME for the devices used to
* render the current frame (in SLI), or
* ::CU_D3D9_DEVICE_LIST_NEXT_FRAME for the devices used to
* render the next frame (in SLI).
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_NOT_FOUND,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cudaD3D9GetDevices
*/
CUresult CUDAAPI cuD3D9GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
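/*
* Example (not part of the original header): a sketch of enumerating the CUDA
* devices behind an existing IDirect3DDevice9. "d3d9Device" is assumed to be
* created elsewhere; the fixed array size of 8 is arbitrary and only for
* illustration.
*/
static CUresult exampleListD3D9CudaDevices(IDirect3DDevice9 *d3d9Device)
{
    CUdevice devices[8];
    unsigned int deviceCount = 0;
    CUresult status = cuD3D9GetDevices(&deviceCount, devices, 8, d3d9Device,
                                       CU_D3D9_DEVICE_LIST_ALL);
    if (status == CUDA_SUCCESS) {
        /* "deviceCount" CUDA devices back this D3D9 device; at most the first
         * 8 entries of "devices" were filled in. */
    }
    return status;
}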
/**
* \brief Create a CUDA context for interoperability with Direct3D 9
*
* Creates a new CUDA context, enables interoperability for that context with
* the Direct3D device \p pD3DDevice, and associates the created CUDA context
* with the calling thread.
* The created ::CUcontext will be returned in \p *pCtx.
* Direct3D resources from this device may be registered and mapped through the
* lifetime of this CUDA context.
* If \p pCudaDevice is non-NULL then the ::CUdevice on which this CUDA context was
* created will be returned in \p *pCudaDevice.
*
* On success, this call will increase the internal reference count on
* \p pD3DDevice. This reference count will be decremented upon destruction of
* this context through ::cuCtxDestroy().
* This context will cease to function if \p pD3DDevice is destroyed or encounters
* an error.
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
* operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
* is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pCtx - Returned newly created CUDA context
* \param pCudaDevice - Returned pointer to the device on which the context was created
* \param Flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9GetDevice,
* ::cuGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
/**
* \brief Create a CUDA context for interoperability with Direct3D 9
*
* Creates a new CUDA context, enables interoperability for that context with
* the Direct3D device \p pD3DDevice, and associates the created CUDA context
* with the calling thread.
* The created ::CUcontext will be returned in \p *pCtx.
* Direct3D resources from this device may be registered and mapped through the
* lifetime of this CUDA context.
*
* On success, this call will increase the internal reference count on
* \p pD3DDevice. This reference count will be decremented upon destruction of
* this context through ::cuCtxDestroy().
* This context will cease to function if \p pD3DDevice is destroyed or encounters
* an error.
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
* operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
* is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pCtx - Returned newly created CUDA context
* \param flags - Context creation flags (see ::cuCtxCreate() for details)
* \param pD3DDevice - Direct3D device to create interoperability context with
* \param cudaDevice - The CUDA device on which to create the context. This device
* must be among the devices returned when querying
* ::CU_D3D9_DEVICES_ALL from ::cuD3D9GetDevices.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9GetDevices,
* ::cuGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuD3D9CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice cudaDevice);
/**
* \brief Get the Direct3D 9 device against which the current CUDA context was
* created
*
* Returns in \p *ppD3DDevice the Direct3D device against which this CUDA context
* was created in ::cuD3D9CtxCreate().
*
* \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
* \notefnerr
*
* \sa
* ::cuD3D9GetDevice,
* ::cudaD3D9GetDirect3DDevice
*/
CUresult CUDAAPI cuD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3DDevice);
/**
* \brief Register a Direct3D 9 resource for access by CUDA
*
* Registers the Direct3D 9 resource \p pD3DResource for access by CUDA and
* returns a CUDA handle to \p pD3DResource in \p pCudaResource.
* The handle returned in \p pCudaResource may be used to map and unmap this
* resource until it is unregistered.
* On success this call will increase the internal reference count on
* \p pD3DResource. This reference count will be decremented when this
* resource is unregistered through ::cuGraphicsUnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
* - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
* - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
* - ::IDirect3DSurface9: may be accessed through an array.
* Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
* through an array.
*
* The \p Flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported formats is as follows:
* - D3DFMT_L8
* - D3DFMT_L16
* - D3DFMT_A8R8G8B8
* - D3DFMT_X8R8G8B8
* - D3DFMT_G16R16
* - D3DFMT_A8B8G8R8
* - D3DFMT_A8
* - D3DFMT_A8L8
* - D3DFMT_Q8W8V8U8
* - D3DFMT_V16U16
* - D3DFMT_A16B16G16R16F
* - D3DFMT_A16B16G16R16
* - D3DFMT_R32F
* - D3DFMT_G16R16F
* - D3DFMT_A32B32G32R32F
* - D3DFMT_G32R32F
* - D3DFMT_R16F
*
* If Direct3D interoperability is not initialized for this context using
* ::cuD3D9CtxCreate then ::CUDA_ERROR_INVALID_CONTEXT is returned.
* If \p pD3DResource is of incorrect type or is already registered then
* ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
* If \p Flags is not one of the values specified above then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* \param pCudaResource - Returned graphics resource handle
* \param pD3DResource - Direct3D resource to register
* \param Flags - Parameters for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuD3D9CtxCreate,
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsD3D9RegisterResource
*/
CUresult CUDAAPI cuGraphicsD3D9RegisterResource(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
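/*
* Example (not part of the original header): a minimal sketch of registering
* a D3D9 vertex buffer and obtaining a linear device pointer through the
* graphics-interop API described above. "vb" and "stream" are assumed to be
* valid; error handling is reduced to early exits.
*/
static CUresult exampleMapD3D9VertexBuffer(IDirect3DVertexBuffer9 *vb, CUstream stream)
{
    CUgraphicsResource res = NULL;
    CUdeviceptr devPtr = 0;
    size_t size = 0;
    CUresult status;
    status = cuGraphicsD3D9RegisterResource(&res, (IDirect3DResource9 *)vb,
                                            CU_GRAPHICS_REGISTER_FLAGS_NONE);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuGraphicsMapResources(1, &res, stream);
    if (status == CUDA_SUCCESS) {
        /* Buffers map to a device pointer; surfaces and textures map to arrays. */
        status = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
        /* ... read or write [devPtr, devPtr + size) from kernels here ... */
        cuGraphicsUnmapResources(1, &res, stream);
    }
    cuGraphicsUnregisterResource(res);
    return status;
}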
/**
* \defgroup CUDA_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated Direct3D 9 interoperability functions of the
* low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated Direct3D 9 interoperability functionality.
* @{
*/
/** Flags to register a resource */
typedef enum CUd3d9register_flags_enum {
CU_D3D9_REGISTER_FLAGS_NONE = 0x00,
CU_D3D9_REGISTER_FLAGS_ARRAY = 0x01,
} CUd3d9register_flags;
/** Flags to map or unmap a resource */
typedef enum CUd3d9map_flags_enum {
CU_D3D9_MAPRESOURCE_FLAGS_NONE = 0x00,
CU_D3D9_MAPRESOURCE_FLAGS_READONLY = 0x01,
CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
} CUd3d9map_flags;
/**
* \brief Register a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cuD3D9UnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cuD3D9UnregisterResource().
*
* This call is potentially high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: Cannot be used with \p Flags set to
* ::CU_D3D9_REGISTER_FLAGS_ARRAY.
* - ::IDirect3DIndexBuffer9: Cannot be used with \p Flags set to
* ::CU_D3D9_REGISTER_FLAGS_ARRAY.
* - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and
* faces of cube maps may not be registered directly. To access individual
* surfaces associated with a texture, one must register the base texture
* object. For restrictions on the \p Flags parameter, see type
* ::IDirect3DBaseTexture9.
* - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
* associated with all mipmap levels of all faces of the texture will be
* accessible to CUDA.
*
* The \p Flags argument specifies the mechanism through which CUDA will access
* the Direct3D resource. The following values are allowed.
*
* - ::CU_D3D9_REGISTER_FLAGS_NONE: Specifies that CUDA will access this resource
* through a ::CUdeviceptr. The pointer, size, and (for textures) pitch for
* each subresource of this allocation may be queried through
* ::cuD3D9ResourceGetMappedPointer(), ::cuD3D9ResourceGetMappedSize(), and
* ::cuD3D9ResourceGetMappedPitch() respectively. This option is valid for
* all resource types.
* - ::CU_D3D9_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
* resource through a ::CUarray queried on a sub-resource basis through
* ::cuD3D9ResourceGetMappedArray(). This option is only valid for resources
* of type ::IDirect3DSurface9 and subtypes of ::IDirect3DBaseTexture9.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
* not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context, then
* ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
* type (e.g. is a non-stand-alone ::IDirect3DSurface9) or is already
* registered, then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource
* cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
*
* \param pResource - Resource to register for CUDA access
* \param Flags - Flags for resource registration
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_OUT_OF_MEMORY,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsD3D9RegisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int Flags);
/**
* \brief Unregister a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterResource(IDirect3DResource9 *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResource for access by CUDA.
*
* The resources in \p ppResource may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are mapped
* by CUDA. If an application does so the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D calls
* issued before ::cuD3D9MapResources() will complete before any CUDA kernels
* issued after ::cuD3D9MapResources() begin.
*
* If any of \p ppResource have not been registered for use with CUDA or if
* \p ppResource contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are
* presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
* returned.
*
* \param count - Number of resources in ppResource
* \param ppResource - Resources to map for CUDA usage
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapResources(unsigned int count, IDirect3DResource9 **ppResource);
/**
* \brief Unmaps Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResource.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cuD3D9UnmapResources() will complete before any Direct3D
* calls issued after ::cuD3D9UnmapResources() begin.
*
* If any of \p ppResource have not been registered for use with CUDA or if
* \p ppResource contains any duplicate entries, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are not
* presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResource - Resources to unmap for CUDA
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa
* ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapResources(unsigned int count, IDirect3DResource9 **ppResource);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set \p Flags for mapping the Direct3D resource \p pResource.
*
* Changes to \p Flags will take effect the next time \p pResource is mapped.
* The \p Flags argument may be any of the following:
* - ::CU_D3D9_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_D3D9_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* \param pResource - Registered resource to set flags for
* \param Flags - Parameters for resource mapping
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int Flags);
/**
* \brief Get the dimensions of a registered surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource, which corresponds
* to \p Face and \p Level.
*
* Because anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or
* ::IDirect3DSurface9 or if \p pResource has not been registered for use with
* CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p Face and \p Level may
* be accessed. The value set in \p pArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D9_REGISTER_FLAGS_ARRAY then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource is not mapped then ::CUDA_ERROR_NOT_MAPPED is
* returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedArray(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get the pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pDevPtr the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p Face and \p Level.
* The value set in \p pDevPtr may change every time that \p pResource is
* mapped.
*
* If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
* returned. If \p pResource was not registered with usage flags
* ::CU_D3D9_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is returned.
* If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* If \p pResource is of type ::IDirect3DCubeTexture9, then \p Face must be one
* of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types
* \p Face must be 0. If \p Face is invalid, then ::CUDA_ERROR_INVALID_VALUE
* is returned.
*
* If \p pResource is of type ::IDirect3DBaseTexture9, then \p Level must
* correspond to a valid mipmap level. At present only mipmap level 0 is
* supported. For all other types \p Level must be 0. If \p Level is invalid,
* then ::CUDA_ERROR_INVALID_VALUE is returned.
*
* \param pDevPtr - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
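/*
* Example (not part of the original header): a sketch of how the deprecated
* CUDA 2.x-era calls above fit together for a vertex buffer. New code should
* use the ::cuGraphics* path documented earlier in this file; "vb" is assumed
* to be a live IDirect3DVertexBuffer9 and error handling is minimal.
*/
static CUresult exampleDeprecatedD3D9Flow(IDirect3DVertexBuffer9 *vb)
{
    IDirect3DResource9 *res = (IDirect3DResource9 *)vb;
    CUdeviceptr devPtr = 0;
    CUresult status;
    status = cuD3D9RegisterResource(res, CU_D3D9_REGISTER_FLAGS_NONE);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuD3D9MapResources(1, &res);
    if (status == CUDA_SUCCESS) {
        /* Face and Level are 0 for non-texture resources such as buffers. */
        status = cuD3D9ResourceGetMappedPointer(&devPtr, res, 0, 0);
        /* ... access "devPtr" from kernels here ... */
        cuD3D9UnmapResources(1, &res);
    }
    cuD3D9UnregisterResource(res);
    return status;
}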
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p Face and \p Level. The value
* set in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
* with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
* access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer.
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsResourceGetMappedPointer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p Face and \p Level. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
* \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA,
* then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not
* registered with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped
* for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
*
* For usage requirements of \p Face and \p Level parameters, see
* ::cuD3D9ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param Face - Face of resource to access
* \param Level - Level of resource to access
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_NOT_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsSubResourceGetMappedArray
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
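/*
* Example (not part of the original header): the offset arithmetic described
* above as a small helper. "bytesPerPixel" must match the format of the
* registered surface, and the pitch values are the ones returned by
* ::cuD3D9ResourceGetMappedPitch for the same Face and Level; pass z == 0
* (or slicePitch == 0) for 2D surfaces.
*/
static size_t exampleSampleByteOffset(size_t x, size_t y, size_t z,
                                      size_t pitch, size_t slicePitch,
                                      size_t bytesPerPixel)
{
    return z * slicePitch + y * pitch + bytesPerPixel * x;
}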
/* CUDA 1.x compatibility API. These functions are deprecated, please use the ones above. */
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9Begin(IDirect3DDevice9 *pDevice);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9End(void);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
/** @} */ /* END CUDA_D3D9_DEPRECATED */
/** @} */ /* END CUDA_D3D9 */
/**
* CUDA API versioning support
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuD3D9CtxCreate
#undef cuD3D9ResourceGetSurfaceDimensions
#undef cuD3D9ResourceGetMappedPointer
#undef cuD3D9ResourceGetMappedSize
#undef cuD3D9ResourceGetMappedPitch
#undef cuD3D9MapVertexBuffer
CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedSize(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,131 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAD3D9TYPEDEFS_H
#define CUDAD3D9TYPEDEFS_H
// Dependent includes for cudaD3D9.h
#include <d3d9.h>
#include <cudaD3D9.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaD3D9.h
*/
#define PFN_cuD3D9GetDevice PFN_cuD3D9GetDevice_v2000
#define PFN_cuD3D9GetDevices PFN_cuD3D9GetDevices_v3020
#define PFN_cuD3D9CtxCreate PFN_cuD3D9CtxCreate_v3020
#define PFN_cuD3D9CtxCreateOnDevice PFN_cuD3D9CtxCreateOnDevice_v3020
#define PFN_cuD3D9GetDirect3DDevice PFN_cuD3D9GetDirect3DDevice_v2000
#define PFN_cuGraphicsD3D9RegisterResource PFN_cuGraphicsD3D9RegisterResource_v3000
#define PFN_cuD3D9RegisterResource PFN_cuD3D9RegisterResource_v2000
#define PFN_cuD3D9UnregisterResource PFN_cuD3D9UnregisterResource_v2000
#define PFN_cuD3D9MapResources PFN_cuD3D9MapResources_v2000
#define PFN_cuD3D9UnmapResources PFN_cuD3D9UnmapResources_v2000
#define PFN_cuD3D9ResourceSetMapFlags PFN_cuD3D9ResourceSetMapFlags_v2000
#define PFN_cuD3D9ResourceGetSurfaceDimensions PFN_cuD3D9ResourceGetSurfaceDimensions_v3020
#define PFN_cuD3D9ResourceGetMappedArray PFN_cuD3D9ResourceGetMappedArray_v2010
#define PFN_cuD3D9ResourceGetMappedPointer PFN_cuD3D9ResourceGetMappedPointer_v3020
#define PFN_cuD3D9ResourceGetMappedSize PFN_cuD3D9ResourceGetMappedSize_v3020
#define PFN_cuD3D9ResourceGetMappedPitch PFN_cuD3D9ResourceGetMappedPitch_v3020
#define PFN_cuD3D9Begin PFN_cuD3D9Begin_v2000
#define PFN_cuD3D9End PFN_cuD3D9End_v2000
#define PFN_cuD3D9RegisterVertexBuffer PFN_cuD3D9RegisterVertexBuffer_v2000
#define PFN_cuD3D9MapVertexBuffer PFN_cuD3D9MapVertexBuffer_v3020
#define PFN_cuD3D9UnmapVertexBuffer PFN_cuD3D9UnmapVertexBuffer_v2000
#define PFN_cuD3D9UnregisterVertexBuffer PFN_cuD3D9UnregisterVertexBuffer_v2000
/**
* Type definitions for functions defined in cudaD3D9.h
*/
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevice_v2000)(CUdevice_v1 *pCudaDevice, const char *pszAdapterName);
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice_v1 cudaDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9GetDirect3DDevice_v2000)(IDirect3DDevice9 **ppD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D9RegisterResource_v3000)(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterResource_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterResource_v2000)(IDirect3DResource9 *pResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceSetMapFlags_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedArray_v2010)(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v3020)(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9Begin_v2000)(IDirect3DDevice9 *pDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9End_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
/*
* Type definitions for older versioned functions in cudaD3D9.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v2000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v2000)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v2000)(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v2000)(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v2000)(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v2000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,610 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAGL_H
#define CUDAGL_H
#include <cuda.h>
#ifdef __APPLE__
#include <OpenGL/gl.h>
#else
#include <GL/gl.h>
#endif
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef CUDA_FORCE_API_VERSION
#error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __CUDA_API_PER_THREAD_DEFAULT_STREAM
#define __CUDA_API_PTDS(api) api ## _ptds
#define __CUDA_API_PTSZ(api) api ## _ptsz
#else
#define __CUDA_API_PTDS(api) api
#define __CUDA_API_PTSZ(api) api
#endif
#define cuGLCtxCreate cuGLCtxCreate_v2
#define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2)
#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
#define cuGLGetDevices cuGLGetDevices_v2
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file cudaGL.h
* \brief Header file for the OpenGL interoperability functions of the
* low-level CUDA driver application programming interface.
*/
/**
* \defgroup CUDA_GL OpenGL Interoperability
* \ingroup CUDA_DRIVER
*
* ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the OpenGL interoperability functions of the
* low-level CUDA driver application programming interface. Note that mapping
* of OpenGL resources is performed with the graphics API agnostic, resource
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
*
* @{
*/
#if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV;
#endif
#endif /* _WIN32 */
/**
* \brief Registers an OpenGL buffer object
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. A handle to the registered object is returned as \p
* pCudaResource. The register flags \p Flags specify the intended usage,
* as follows:
*
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
* will not write to this resource.
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
* \param pCudaResource - Pointer to the returned object handle
* \param buffer - name of buffer object to be registered
* \param Flags - Register flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsResourceGetMappedPointer,
* ::cudaGraphicsGLRegisterBuffer
*/
CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
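/*
* Example (not part of the original header): a minimal sketch of registering
* an OpenGL buffer object and filling it from CUDA. "vbo" is assumed to be a
* buffer created by the current GL context and "stream" a valid CUstream;
* WRITE_DISCARD is chosen here only because the sketch overwrites the buffer.
*/
static CUresult exampleWriteGLBuffer(GLuint vbo, CUstream stream)
{
    CUgraphicsResource res = NULL;
    CUdeviceptr devPtr = 0;
    size_t size = 0;
    CUresult status;
    status = cuGraphicsGLRegisterBuffer(&res, vbo,
                                        CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
    if (status != CUDA_SUCCESS)
        return status;
    status = cuGraphicsMapResources(1, &res, stream);
    if (status == CUDA_SUCCESS) {
        status = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
        /* ... launch a kernel that writes [devPtr, devPtr + size), then unmap
         * before OpenGL draws from the buffer ... */
        cuGraphicsUnmapResources(1, &res, stream);
    }
    cuGraphicsUnregisterResource(res);
    return status;
}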
/**
* \brief Register an OpenGL texture or renderbuffer object
*
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
* A handle to the registered object is returned as \p pCudaResource.
*
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
* or ::GL_RENDERBUFFER.
*
* The register flags \p Flags specify the intended usage, as follows:
*
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
* will not write to this resource.
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* The following image formats are supported. For brevity's sake, the list is abbreviated.
* For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
*
* The following image classes are currently disallowed:
* - Textures with borders
* - Multisampled renderbuffers
*
* \param pCudaResource - Pointer to the returned object handle
* \param image - name of texture or renderbuffer object to be registered
* \param target - Identifies the type of object specified by \p image
* \param Flags - Register flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa
* ::cuGraphicsUnregisterResource,
* ::cuGraphicsMapResources,
* ::cuGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsGLRegisterImage
*/
CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
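/*
 * Illustrative sketch (not part of the original header): registering a 2D GL
 * texture read-only and retrieving its backing CUDA array. `tex` is assumed to
 * be a valid, already-allocated GL_TEXTURE_2D; error checking is omitted.
 *
 *   CUgraphicsResource res;
 *   CUarray array;
 *   cuGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D,
 *                             CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY);
 *   cuGraphicsMapResources(1, &res, 0);
 *   cuGraphicsSubResourceGetMappedArray(&array, res, 0, 0);  // face 0, mip 0
 *   // ... bind `array` to a texture object and sample it from a kernel ...
 *   cuGraphicsUnmapResources(1, &res, 0);
 *   cuGraphicsUnregisterResource(res);
 */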
#ifdef _WIN32
/**
* \brief Gets the CUDA device associated with hGpu
*
* Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
* applicable.
*
* \param pDevice - Device associated with hGpu
* \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
* ::cuGLSetBufferObjectMapFlags,
* ::cudaWGLGetDevice
*/
CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
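/*
 * Illustrative sketch (not part of the original header, Windows only): mapping
 * a WGL_NV_gpu_affinity GPU handle to its CUDA device. Assumes the
 * wglEnumGpusNV entry point has already been obtained via wglGetProcAddress.
 *
 *   HGPUNV hGpu;
 *   if (wglEnumGpusNV(0, &hGpu)) {   // first affinity GPU
 *       CUdevice dev;
 *       cuWGLGetDevice(&dev, hGpu);
 *   }
 */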
#endif /* _WIN32 */
/**
* CUDA devices corresponding to an OpenGL device
*/
typedef enum CUGLDeviceList_enum {
CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
} CUGLDeviceList;
/**
* \brief Gets the CUDA devices associated with the current OpenGL context
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
* at most cudaDeviceCount of the CUDA-compatible devices corresponding to
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
* context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
*
* The \p deviceList argument may be any of the following:
* - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
* - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
* render the current frame (in SLI).
* - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
 * render the next frame (in SLI). Note that this is a prediction; it is not
 * guaranteed to be correct in all cases.
*
* \param pCudaDeviceCount - Returned number of CUDA devices.
* \param pCudaDevices - Returned CUDA devices.
* \param cudaDeviceCount - The size of the output device array pCudaDevices.
* \param deviceList - The set of devices to return.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_NO_DEVICE,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
*
* \note This function is not supported on Mac OS X.
* \notefnerr
*
* \sa
* ::cuWGLGetDevice,
* ::cudaGLGetDevices
*/
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
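/*
 * Illustrative sketch (not part of the original header): enumerating the CUDA
 * devices behind the currently bound OpenGL context. A GL context must be
 * current on the calling thread; error checking is omitted.
 *
 *   unsigned int count = 0;
 *   CUdevice devices[8];
 *   cuGLGetDevices(&count, devices, 8, CU_GL_DEVICE_LIST_ALL);
 *   // devices[0..count-1] now hold the CUDA devices used by the GL context
 */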
/**
* \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
*
* ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
* CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated OpenGL interoperability functionality.
*
* @{
*/
/** Flags to map or unmap a resource */
typedef enum CUGLmap_flags_enum {
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} CUGLmap_flags;
/**
* \brief Create a CUDA context for interoperability with OpenGL
*
* \deprecated This function is deprecated as of Cuda 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA context with an OpenGL
* context in order to achieve maximum interoperability performance.
*
* \param pCtx - Returned CUDA context
* \param Flags - Options for CUDA context creation
* \param device - Device on which to create the context
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_OUT_OF_MEMORY
* \notefnerr
*
* \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
* ::cuWGLGetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
/**
* \brief Initializes OpenGL interoperability
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Initializes OpenGL interoperability. This function is deprecated
* and calling it is no longer required. It may fail if the needed
* OpenGL driver facilities are not available.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_UNKNOWN
* \notefnerr
*
* \sa ::cuGLMapBufferObject,
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
* ::cuWGLGetDevice
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
/**
* \brief Registers an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. This function must be called before CUDA can map the buffer
* object. There must be a valid OpenGL context bound to the current
* thread when this function is called, and the buffer name is
* resolved by that context.
*
* \param buffer - The name of the buffer object to register.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_ALREADY_MAPPED
* \notefnerr
*
* \sa ::cuGraphicsGLRegisterBuffer
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
/**
* \brief Maps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Maps the buffer object specified by \p buffer into the address space of the
* current CUDA context and returns in \p *dptr and \p *size the base pointer
* and size of the resulting mapping.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* All streams in the current CUDA context are synchronized with the
* current GL context.
*
* \param dptr - Returned mapped base pointer
* \param size - Returned size of mapping
* \param buffer - The name of the buffer object to map
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_MAP_FAILED
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer);
/**
* \brief Unmaps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unmaps the buffer object specified by \p buffer for access by CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* All streams in the current CUDA context are synchronized with the
* current GL context.
*
* \param buffer - Buffer object to unmap
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
/**
* \brief Unregister an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unregisters the buffer object specified by \p buffer. This
* releases any resources associated with the registered buffer.
* After this call, the buffer may no longer be mapped for access by
* CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* \param buffer - Name of the buffer object to unregister
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnregisterResource
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
/**
* \brief Set the map flags for an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Sets the map flags for the buffer object specified by \p buffer.
*
* Changes to \p Flags will take effect the next time \p buffer is mapped.
* The \p Flags argument may be any of the following:
* - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA kernels. This is the default value.
* - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
* access this resource will not write to this resource.
* - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
* which access this resource will not read from this resource and will
* write over the entire contents of the resource, so none of the data
* previously stored in the resource will be preserved.
*
* If \p buffer has not been registered for use with CUDA, then
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
 * \param buffer - Buffer object whose map flags are being set
* \param Flags - Map flags
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_HANDLE,
* ::CUDA_ERROR_ALREADY_MAPPED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* \notefnerr
*
* \sa ::cuGraphicsResourceSetMapFlags
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
/**
* \brief Maps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Maps the buffer object specified by \p buffer into the address space of the
* current CUDA context and returns in \p *dptr and \p *size the base pointer
* and size of the resulting mapping.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* Stream \p hStream in the current CUDA context is synchronized with
* the current GL context.
*
* \param dptr - Returned mapped base pointer
* \param size - Returned size of mapping
* \param buffer - The name of the buffer object to map
* \param hStream - Stream to synchronize
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_MAP_FAILED
* \notefnerr
*
* \sa ::cuGraphicsMapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
/**
* \brief Unmaps an OpenGL buffer object
*
* \deprecated This function is deprecated as of Cuda 3.0.
*
* Unmaps the buffer object specified by \p buffer for access by CUDA.
*
* There must be a valid OpenGL context bound to the current thread
* when this function is called. This must be the same context, or a
* member of the same shareGroup, as the context that was bound when
* the buffer was registered.
*
* Stream \p hStream in the current CUDA context is synchronized with
* the current GL context.
*
* \param buffer - Name of the buffer object to unmap
* \param hStream - Stream to synchronize
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
* ::CUDA_ERROR_NOT_INITIALIZED,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE
* \notefnerr
*
* \sa ::cuGraphicsUnmapResources
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
/** @} */ /* END CUDA_GL_DEPRECATED */
/** @} */ /* END CUDA_GL */
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cuGLCtxCreate
#undef cuGLMapBufferObject
#undef cuGLMapBufferObjectAsync
#undef cuGLGetDevices
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer);
CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
#endif /* __CUDA_API_VERSION_INTERNAL */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,127 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAGLTYPEDEFS_H
#define CUDAGLTYPEDEFS_H
// Dependent includes for cudagl.h
#ifdef __APPLE__
#include <OpenGL/gl.h>
#else
#include <GL/gl.h>
#endif
#include <cudaGL.h>
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
#else
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
#endif
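/*
 * Note (not part of the original header): with CUDA_API_PER_THREAD_DEFAULT_STREAM
 * defined, __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000) expands to
 * PFN_cuGLMapBufferObject_v7000_ptds; without it, the same macro expands to
 * PFN_cuGLMapBufferObject_v3020, i.e. the legacy default-stream entry point.
 */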
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaGL.h
*/
#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
#define PFN_cuGLInit PFN_cuGLInit_v2000
#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
/**
* Type definitions for functions defined in cudaGL.h
*/
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
#ifdef _WIN32
typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
#endif
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
/*
* Type definitions for older versioned functions in cuda.h
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,217 +0,0 @@
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef cuda_profiler_H
#define cuda_profiler_H
#include <cuda.h>
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
* Profiler Output Modes
*/
/*DEVICE_BUILTIN*/
typedef enum CUoutput_mode_enum
{
CU_OUT_KEY_VALUE_PAIR = 0x00, /**< Output mode Key-Value pair format. */
CU_OUT_CSV = 0x01 /**< Output mode Comma separated values format. */
}CUoutput_mode;
/**
* \ingroup CUDA_DRIVER
* \defgroup CUDA_PROFILER_DEPRECATED Profiler Control [DEPRECATED]
*
* ___MANBRIEF___ profiler control functions of the low-level CUDA driver API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the low-level CUDA
* driver application programming interface.
*
* @{
*/
/**
* \brief Initialize the profiling.
*
* \deprecated
*
 * Using this API, the user can initialize the CUDA profiler by specifying
 * the configuration file, output file and output file format. This
 * API is generally used to profile different sets of counters by
 * looping over the kernel launch. The \p configFile parameter can be used
* to select profiling options including profiler counters. Refer to
* the "Compute Command Line Profiler User Guide" for supported
* profiler options and counters.
*
* Limitation: The CUDA profiler cannot be initialized with this API
* if another profiling tool is already active, as indicated by the
* ::CUDA_ERROR_PROFILER_DISABLED return code.
*
* Typical usage of the profiling APIs is as follows:
*
* for each set of counters/options\n
* {\n
* cuProfilerInitialize(); //Initialize profiling, set the counters or options in the config file \n
* ...\n
* cuProfilerStart(); \n
* // code to be profiled \n
* cuProfilerStop(); \n
* ...\n
* cuProfilerStart(); \n
* // code to be profiled \n
* cuProfilerStop(); \n
* ...\n
* }\n
*
* \param configFile - Name of the config file that lists the counters/options
* for profiling.
* \param outputFile - Name of the outputFile where the profiling results will
* be stored.
* \param outputMode - outputMode, can be ::CU_OUT_KEY_VALUE_PAIR or ::CU_OUT_CSV.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT,
* ::CUDA_ERROR_INVALID_VALUE,
* ::CUDA_ERROR_PROFILER_DISABLED
* \notefnerr
*
* \sa
* ::cuProfilerStart,
* ::cuProfilerStop,
* ::cudaProfilerInitialize
*/
__CUDA_DEPRECATED CUresult CUDAAPI cuProfilerInitialize(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
/** @} */ /* END CUDA_PROFILER_DEPRECATED */
/**
* \ingroup CUDA_DRIVER
* \defgroup CUDA_PROFILER Profiler Control
*
* ___MANBRIEF___ profiler control functions of the low-level CUDA driver API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the low-level CUDA
* driver application programming interface.
*
* @{
*/
/**
* \brief Enable profiling.
*
* Enables profile collection by the active profiling tool for the
* current context. If profiling is already enabled, then
* cuProfilerStart() has no effect.
*
* cuProfilerStart and cuProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuProfilerInitialize,
* ::cuProfilerStop,
* ::cudaProfilerStart
*/
CUresult CUDAAPI cuProfilerStart(void);
/**
* \brief Disable profiling.
*
* Disables profile collection by the active profiling tool for the
* current context. If profiling is already disabled, then
* cuProfilerStop() has no effect.
*
* cuProfilerStart and cuProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_INVALID_CONTEXT
* \notefnerr
*
* \sa
* ::cuProfilerInitialize,
* ::cuProfilerStart,
* ::cudaProfilerStop
*/
CUresult CUDAAPI cuProfilerStop(void);
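/*
 * Illustrative sketch (not part of the original header): bracketing a region of
 * interest so that the active profiling tool records only that region. The
 * launch_kernels() helper is a placeholder for the caller's own work.
 *
 *   cuProfilerStart();      // begin collection for the current context
 *   launch_kernels();       // only this region is profiled
 *   cuCtxSynchronize();
 *   cuProfilerStop();       // end collection
 */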
/** @} */ /* END CUDA_PROFILER */
#ifdef __cplusplus
};
#endif
#undef __CUDA_DEPRECATED
#endif

View file

@@ -1,78 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDAPROFILERTYPEDEFS_H
#define CUDAPROFILERTYPEDEFS_H
#include <cudaProfiler.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cudaProfiler.h
*/
#define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
#define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
#define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
/**
* Type definitions for functions defined in cudaProfiler.h
*/
typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard

View file

@@ -1,939 +0,0 @@
/*
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CUDATYPEDEFS_H
#define CUDATYPEDEFS_H
#include <cuda.h>
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
#else
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
#endif
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
* Macros for the latest version for each driver function in cuda.h
*/
#define PFN_cuGetErrorString PFN_cuGetErrorString_v6000
#define PFN_cuGetErrorName PFN_cuGetErrorName_v6000
#define PFN_cuInit PFN_cuInit_v2000
#define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020
#define PFN_cuDeviceGet PFN_cuDeviceGet_v2000
#define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000
#define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000
#define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040
#define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000
#define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020
#define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
#define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000
#define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020
#define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020
#define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020
#define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020
#define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000
#define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000
#define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000
#define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000
#define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000
#define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000
#define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000
#define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040
#define PFN_cuCtxCreate PFN_cuCtxCreate_v11040
#define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000
#define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000
#define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000
#define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000
#define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000
#define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000
#define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000
#define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000
#define PFN_cuCtxSetLimit PFN_cuCtxSetLimit_v3010
#define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010
#define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020
#define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020
#define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020
#define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020
#define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020
#define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050
#define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000
#define PFN_cuCtxAttach PFN_cuCtxAttach_v2000
#define PFN_cuCtxDetach PFN_cuCtxDetach_v2000
#define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040
#define PFN_cuModuleLoad PFN_cuModuleLoad_v2000
#define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000
#define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010
#define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000
#define PFN_cuModuleUnload PFN_cuModuleUnload_v2000
#define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000
#define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020
#define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000
#define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000
#define PFN_cuLinkCreate PFN_cuLinkCreate_v6050
#define PFN_cuLinkAddData PFN_cuLinkAddData_v6050
#define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050
#define PFN_cuLinkComplete PFN_cuLinkComplete_v5050
#define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050
#define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020
#define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
#define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020
#define PFN_cuMemFree PFN_cuMemFree_v3020
#define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020
#define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020
#define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000
#define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020
#define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020
#define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030
#define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000
#define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010
#define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010
#define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010
#define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010
#define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010
#define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000
#define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010
#define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050
#define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000
#define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
#define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
#define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
#define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
#define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
#define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
#define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
#define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
#define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
#define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
#define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
#define PFN_cuMemcpy2DUnaligned __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
#define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
#define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
#define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
#define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
#define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
#define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
#define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
#define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
#define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
#define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
#define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
#define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
#define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
#define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
#define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
#define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
#define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
#define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
#define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
#define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
#define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
#define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
#define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
#define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
#define PFN_cuArrayCreate PFN_cuArrayCreate_v3020
#define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020
#define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010
#define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010
#define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060
#define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060
#define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020
#define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000
#define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020
#define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020
#define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000
#define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000
#define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000
#define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020
#define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020
#define PFN_cuMemCreate PFN_cuMemCreate_v10020
#define PFN_cuMemRelease PFN_cuMemRelease_v10020
#define PFN_cuMemMap PFN_cuMemMap_v10020
#define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
#define PFN_cuMemUnmap PFN_cuMemUnmap_v10020
#define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020
#define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020
#define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020
#define PFN_cuMemImportFromShareableHandle PFN_cuMemImportFromShareableHandle_v10020
#define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020
#define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020
#define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000
#define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
#define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
#define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020
#define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020
#define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020
#define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020
#define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020
#define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020
#define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020
#define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
#define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020
#define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020
#define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020
#define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020
#define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000
#define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
#define PFN_cuMemAdvise PFN_cuMemAdvise_v8000
#define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000
#define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000
#define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000
#define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000
#define PFN_cuStreamCreate PFN_cuStreamCreate_v2000
#define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050
#define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
#define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
#define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
#define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
#define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
#define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
#define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010
#define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
#define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
#define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
#define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
#define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
#define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
#define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
#define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
#define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000
#define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
#define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
#define PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
#define PFN_cuEventCreate PFN_cuEventCreate_v2000
#define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
#define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
#define PFN_cuEventQuery PFN_cuEventQuery_v2000
#define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000
#define PFN_cuEventDestroy PFN_cuEventDestroy_v4000
#define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000
#define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000
#define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000
#define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
#define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000
#define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000
#define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
#define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
#define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000
#define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
#define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
#define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
#define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
#define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
#define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020
#define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000
#define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000
#define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020
#define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
#define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
#define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000
#define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
#define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000
#define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000
#define PFN_cuParamSetSize PFN_cuParamSetSize_v2000
#define PFN_cuParamSeti PFN_cuParamSeti_v2000
#define PFN_cuParamSetf PFN_cuParamSetf_v2000
#define PFN_cuParamSetv PFN_cuParamSetv_v2000
#define PFN_cuLaunch PFN_cuLaunch_v2000
#define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000
#define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000
#define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000
#define PFN_cuGraphCreate PFN_cuGraphCreate_v10000
#define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000
#define PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000
#define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000
#define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000
#define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000
#define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000
#define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000
#define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000
#define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000
#define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000
#define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000
#define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000
#define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000
#define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000
#define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000
#define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010
#define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010
#define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010
#define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010
#define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010
#define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010
#define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
#define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
#define PFN_cuGraphClone PFN_cuGraphClone_v10000
#define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000
#define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000
#define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000
#define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000
#define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000
#define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000
#define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000
#define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000
#define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000
#define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000
#define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000
#define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040
#define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010
#define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020
#define PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020
#define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020
#define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010
#define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010
#define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010
#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
#define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
#define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
#define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000
#define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000
#define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020
#define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000
#define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000
#define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000
#define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030
#define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040
#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
#define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040
#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
#define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040
#define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040
#define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
#define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050
#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
#define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
#define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000
#define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000
#define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020
#define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010
#define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000
#define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000
#define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000
#define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000
#define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000
#define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000
#define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000
#define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000
#define PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000
#define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020
#define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000
#define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000
#define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000
#define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000
#define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000
#define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000
#define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000
#define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000
#define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000
#define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000
#define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000
#define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000
#define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000
#define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000
#define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000
#define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000
#define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000
#define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000
#define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000
#define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000
#define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000
#define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000
#define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000
#define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000
#define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000
#define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000
#define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000
#define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000
#define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000
#define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
#define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020
#define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050
#define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
#define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
#define PFN_cuGetExportTable PFN_cuGetExportTable_v3000
#define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000
#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
#define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030
#define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030
#define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030
#define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030
#define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030
#define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030
/*
* Type definitions for functions defined in cuda.h
*/
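/*
 * Illustrative usage sketch (not a normative part of this header): each PFN_*
 * typedef below binds a driver symbol to a specific ABI version, so an entry
 * point can be resolved at run time with cuGetProcAddress and then invoked
 * through the matching typedef. The version value and flags in this sketch
 * are example assumptions (11030 = CUDA 11.3, 0 = default flags):
 *
 *   PFN_cuMemAlloc_v3020 pfnMemAlloc = NULL;
 *   if (cuGetProcAddress("cuMemAlloc", (void **)&pfnMemAlloc, 11030, 0) == CUDA_SUCCESS
 *       && pfnMemAlloc != NULL) {
 *       CUdeviceptr_v2 dptr;
 *       pfnMemAlloc(&dptr, 1024);  // dispatches to the versioned driver entry point
 *   }
 */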
typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
/*
* Type definitions for older versioned functions in cuda.h
*/
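/*
 * Note (descriptive, based on the declarations that follow): the guarded block
 * below lists earlier ABI revisions of entry points that were later
 * re-versioned; for example, the *_v2000 memory functions still take the
 * legacy CUdeviceptr_v1 and unsigned int sizes rather than CUdeviceptr_v2 and
 * size_t. They are exposed only when __CUDA_API_VERSION_INTERNAL is defined.
 */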
#if defined(__CUDA_API_VERSION_INTERNAL)
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // file guard
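
The PFN_* typedefs above exist so that an application can bind versioned driver entry points at run time instead of linking against a particular libcuda. A minimal sketch of that pattern follows; it assumes a CUDA 11.3+ toolkit where cudaTypedefs.h ships these typedefs, uses POSIX dlopen, and trims most error handling.

```cuda
// Illustrative only: open the driver library and bind one entry point through
// its PFN_* typedef. The _v11020 suffix records the CUDA version that
// introduced this signature; the exported symbol name carries no suffix.
#include <cudaTypedefs.h>   // PFN_* typedefs (assumed toolkit header)
#include <dlfcn.h>
#include <cstdio>

int main()
{
    void* libcuda = dlopen("libcuda.so.1", RTLD_NOW);
    if (!libcuda) { std::fprintf(stderr, "CUDA driver not found\n"); return 1; }

    auto pfnMemAllocAsync =
        reinterpret_cast<PFN_cuMemAllocAsync_v11020>(dlsym(libcuda, "cuMemAllocAsync"));
    std::printf("cuMemAllocAsync %s\n", pfnMemAllocAsync ? "resolved" : "missing");

    dlclose(libcuda);
    return 0;
}
```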

View file

@@ -1,211 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_H_
# define _CUDA_AWBARRIER_H_
# include "cuda_awbarrier_primitives.h"
# if !defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
# error This file requires compute capability 7.0 or greater.
# endif
# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
-std=c++11 compiler option.
# endif
_CUDA_AWBARRIER_BEGIN_NAMESPACE
class awbarrier {
public:
class arrival_token {
public:
arrival_token() = default;
~arrival_token() = default;
_CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
private:
_CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
uint64_t token;
friend awbarrier;
};
awbarrier() = default;
awbarrier(const awbarrier&) = delete;
awbarrier& operator=(const awbarrier&) = delete;
~awbarrier() = default;
_CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
_CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
_CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
_CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
_CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
private:
uint64_t barrier;
friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
friend class pipeline;
};
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier::arrival_token::pending_count() const
{
const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
return (pending_count >> 15);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token::arrival_token(uint64_t token)
: token(token)
{
}
_CUDA_AWBARRIER_QUALIFIER
void init(awbarrier* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
const uint32_t init_count = (expected_count << 15) + expected_count;
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
}
_CUDA_AWBARRIER_QUALIFIER
void inval(awbarrier* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
const uint32_t arrive_count = 1 << 15;
const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
(void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
return arrival_token(token);
}
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive_and_drop()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
const uint32_t arrive_count = 1 << 15;
const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
(void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
return arrival_token(token);
}
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
{
constexpr uint64_t max_busy_wait_cycles = 1024;
constexpr uint32_t max_sleep_ns = 1 << 20;
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
return true;
}
uint64_t start_cycles = clock64();
uint64_t elapsed_cycles = 0;
uint32_t sleep_ns = 32;
while (elapsed_cycles < hint_cycles) {
if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
return true;
}
if (elapsed_cycles > max_busy_wait_cycles) {
__nanosleep(sleep_ns);
if (sleep_ns < max_sleep_ns) {
sleep_ns *= 2;
}
}
elapsed_cycles = clock64() - start_cycles;
}
return false;
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::wait(arrival_token token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
while (!timed_wait(token, ~0u));
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::arrive_and_wait()
{
_CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
this->wait(this->arrive());
}
_CUDA_AWBARRIER_QUALIFIER __host__
constexpr uint32_t awbarrier::max()
{
return _CUDA_AWBARRIER_MAX_COUNT;
}
_CUDA_AWBARRIER_END_NAMESPACE
#endif /* !_CUDA_AWBARRIER_H_ */
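
The awbarrier class above is only valid in shared memory and must be initialized by exactly one thread before any thread arrives on it. A minimal device-side sketch, assuming compute capability 7.0+ and one arrival per thread in the block (the kernel name is illustrative):

```cuda
// Illustrative only: one block-wide barrier, initialized once, used as a
// rendezvous point between two phases of work.
#include <cuda_awbarrier.h>

__global__ void block_barrier_demo()
{
    __shared__ nvcuda::experimental::awbarrier bar;    // must live in shared memory

    if (threadIdx.x == 0) {
        nvcuda::experimental::init(&bar, blockDim.x);  // expected_count = threads in block
    }
    __syncthreads();                                   // make the init visible to all threads

    // ... phase 1 work ...

    bar.arrive_and_wait();                             // every thread arrives, then waits

    // ... phase 2 work ...
}
```

Calling arrive() and later wait(token) splits the same rendezvous into two steps, which lets a thread do independent work between signalling its arrival and blocking on completion.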

View file

@@ -1,370 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_HELPERS_H_
# define _CUDA_AWBARRIER_HELPERS_H_
# define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
# define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
# define _CUDA_AWBARRIER_END_NAMESPACE } }
# define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
# define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
# define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE
# if !defined(_CUDA_AWBARRIER_QUALIFIER)
# define _CUDA_AWBARRIER_QUALIFIER inline __device__
# endif
# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
# endif
# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_AWBARRIER_ARCH_700_OR_LATER
# endif
# define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
# if (__CUDA_ARCH__ >= 800)
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 1
# else
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 0
# endif
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
# endif
# if !defined(_CUDA_AWBARRIER_DEBUG)
# if defined(__CUDACC_DEBUG__)
# define _CUDA_AWBARRIER_DEBUG 1
# else
# define _CUDA_AWBARRIER_DEBUG 0
# endif
# endif
# if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
# include <cassert>
# endif
# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
# define _CUDA_AWBARRIER_ABORT() assert(0);
# else
# define _CUDA_AWBARRIER_ASSERT(x)
# define _CUDA_AWBARRIER_ABORT() __trap();
# endif
# if defined(_MSC_VER) && !defined(_WIN64)
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "r"
# else
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "l"
# endif
# if defined(__CUDACC_RTC__)
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
# else
# include <stdint.h>
# endif
# if defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
template<bool UseHWAtomicArrive>
struct ImplementationChooser;
template<>
struct ImplementationChooser<true> {
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
:
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
: "memory");
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
asm volatile ("mbarrier.inval.shared.b64 [%0];"
:
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
uint32_t pending_count;
asm ("mbarrier.pending_count.b64 %0, %1;"
: "=r"(pending_count)
: "l"(token));
return pending_count;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
uint64_t token;
if (Drop) {
asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
} else {
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier))
: "memory");
}
return token;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
uint64_t token;
if (Drop) {
asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
: "memory");
} else {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(token)
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
: "memory");
}
return token;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
uint16_t wait_complete;
asm volatile ("{"
" .reg .pred %%p;"
" mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
" selp.u16 %0, 1, 0, %%p;"
"}"
: "=h"(wait_complete)
: "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
: "memory");
return bool(wait_complete);
}
};
template<>
struct ImplementationChooser<false> {
union AWBarrier {
struct {
uint32_t expected;
uint32_t pending;
} split;
uint64_t raw;
};
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
awbarrier->split.expected = 0x40000000 - expected_count;
awbarrier->split.pending = 0x80000000 - expected_count;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
const uint32_t pending = token >> 32;
return 0x80000000 - (pending & 0x7fffffff);
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
if (Drop) {
(void)atomicAdd_block(&awbarrier->split.expected, 1);
}
__threadfence_block();
const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
const uint32_t new_pending = old_pending + 1;
const bool reset = (old_pending ^ new_pending) & 0x80000000;
if (reset) {
__threadfence_block();
uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
new_expected &= ~0x40000000;
if (new_expected & 0x20000000) {
new_expected |= 0x40000000;
}
atomicAdd_block(&awbarrier->split.pending, new_expected);
}
return static_cast<uint64_t>(old_pending) << 32;
}
template<bool Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
if (Drop) {
(void)atomicAdd_block(&awbarrier->split.expected, count);
}
return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
}
};
_CUDA_AWBARRIER_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_init(barrier, expected_count);
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_inval(barrier);
}
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_token_pending_count(token);
}
template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop_no_complete<Drop>(barrier, arrive_count);
}
template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop<Drop>(barrier);
}
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_test_wait(barrier, token);
}
_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
# endif /* !_CUDA_AWBARRIER_ARCH_700_OR_LATER */
#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
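
In the pre-Ampere software fallback above, the 64-bit barrier word packs an expected counter and a pending counter, and an arrival token simply carries the pre-arrival pending value in its upper 32 bits. A small host-side sketch of the same bias arithmetic (plain C++, no GPU required; illustrative only) makes the constants easier to follow:

```cuda
#include <cstdint>
#include <cassert>

// Mirror of ImplementationChooser<false>::awbarrier_token_pending_count():
// at init, pending is biased to 0x80000000 - expected_count, so the number of
// outstanding arrivals can be recovered from the token's upper 32 bits.
static uint32_t fallback_pending_count(uint64_t token)
{
    const uint32_t pending = static_cast<uint32_t>(token >> 32);
    return 0x80000000u - (pending & 0x7fffffffu);
}

int main()
{
    const uint32_t expected        = 4;                 // expected_count given to init
    const uint32_t pending_at_init = 0x80000000u - expected;
    const uint64_t token           = static_cast<uint64_t>(pending_at_init) << 32;
    assert(fallback_pending_count(token) == expected);  // all 4 arrivals outstanding
    return 0;
}
```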

View file

@@ -1,115 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
# define _CUDA_AWBARRIER_PRIMITIVES_H_
# include "cuda_awbarrier_helpers.h"
# if !defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)
# error This file requires compute capability 7.0 or greater.
# endif
typedef uint64_t __mbarrier_t;
typedef uint64_t __mbarrier_token_t;
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
uint32_t __mbarrier_maximum_count()
{
return _CUDA_AWBARRIER_MAX_COUNT;
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_inval(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token)
{
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
}
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token)
{
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
}
#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
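
The C-style primitives above are the thin layer that the awbarrier class wraps. A minimal kernel sketch using them directly (compute capability 7.0+, one expected arrival per thread; the spin loop is the simplest possible wait, and a real kernel would interleave independent work):

```cuda
// Illustrative only: init, arrive, poll, and invalidate a block-wide mbarrier.
#include <cuda_awbarrier_primitives.h>

__global__ void mbarrier_demo()
{
    __shared__ __mbarrier_t bar;

    if (threadIdx.x == 0) {
        __mbarrier_init(&bar, blockDim.x);                  // one expected arrival per thread
    }
    __syncthreads();

    __mbarrier_token_t token = __mbarrier_arrive(&bar);     // signal this thread's arrival
    while (!__mbarrier_test_wait(&bar, token)) {
        // spin until the phase completes; independent work could go here
    }

    __syncthreads();                                        // nobody may still be polling
    if (threadIdx.x == 0) {
        __mbarrier_inval(&bar);                             // invalidate before the memory is reused
    }
}
```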

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,724 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D10_INTEROP_H__)
#define __CUDA_D3D10_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d10_1.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D10 Direct3D 10 Interoperability
* This section describes the Direct3D 10 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 10
 * resources is performed with the graphics API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D10 device
*/
enum cudaD3D10DeviceList
{
cudaD3D10DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D10 device */
cudaD3D10DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
cudaD3D10DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */
};
/**
* \brief Registers a Direct3D 10 resource for access by CUDA
*
* Registers the Direct3D 10 resource \p pD3DResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
* This call potentially has a high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::ID3D10Buffer: may be accessed via a device pointer
* - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D10RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D10RegisterResource(struct cudaGraphicsResource **resource, ID3D10Resource *pD3DResource, unsigned int flags);
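Registration is the one-time setup step; the per-frame work goes through the generic graphics-interop calls listed in the \sa block above. A hedged host-side sketch (the buffer, stream, and error handling are placeholders):

```cuda
// Illustrative only: register an ID3D10Buffer once, then map it each frame to
// obtain a device pointer that CUDA kernels can read and write.
#include <cuda_d3d10_interop.h>

void register_and_map(ID3D10Buffer* pBuf, cudaStream_t stream)
{
    cudaGraphicsResource* res = nullptr;
    cudaGraphicsD3D10RegisterResource(&res, pBuf, cudaGraphicsRegisterFlagsNone);

    // Per frame: map, fetch the pointer, launch work, unmap.
    cudaGraphicsMapResources(1, &res, stream);

    void*  dptr  = nullptr;
    size_t bytes = 0;
    cudaGraphicsResourceGetMappedPointer(&dptr, &bytes, res);
    // ... launch kernels on 'stream' that use dptr ...

    cudaGraphicsUnmapResources(1, &res, stream);

    // At teardown:
    cudaGraphicsUnregisterResource(res);
}
```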
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
* will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
*
* \param device - Returns the device corresponding to pAdapter
* \param pAdapter - D3D10 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsD3D10RegisterResource,
* ::cuD3D10GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevice(int *device, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 10 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 10 device \p pD3D10Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 10 device \p pD3D10Device.
*
 * If any of the GPUs being used to render \p pD3D10Device are not CUDA-capable then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D10Device - Direct3D 10 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D10DeviceListAll for all devices,
* ::cudaD3D10DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D10DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D10GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, enum cudaD3D10DeviceList deviceList);
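Used together with the enum above, this call is how an application picks which CUDA device should talk to a given D3D10 device. A short sketch, assuming the first returned ordinal is acceptable:

```cuda
// Illustrative only: list the CUDA devices that can interoperate with a D3D10
// device and return the first one, or -1 if none qualifies.
#include <cuda_d3d10_interop.h>

int pick_cuda_device_for(ID3D10Device* pD3D10Device)
{
    int devices[8];
    unsigned int count = 0;
    if (cudaD3D10GetDevices(&count, devices, 8,
                            pD3D10Device, cudaD3D10DeviceListAll) != cudaSuccess
        || count == 0) {
        return -1;
    }
    return devices[0];
}
```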
/** @} */ /* END CUDART_D3D10 */
/**
* \addtogroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 10 interoperability functions.
*
* @{
*/
/**
* CUDA D3D10 Register Flags
*/
enum cudaD3D10RegisterFlags
{
cudaD3D10RegisterFlagsNone = 0, /**< Default; Resource can be accessed through a void* */
cudaD3D10RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */
};
/**
* CUDA D3D10 Map Flags
*/
enum cudaD3D10MapFlags
{
cudaD3D10MapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaD3D10MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaD3D10MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param ppD3D10Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D10SetDirect3DDevice
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10GetDirect3DDevice(ID3D10Device **ppD3D10Device);
/**
* \brief Sets the Direct3D 10 device to use for interoperability with
* a CUDA device
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D10
* device in order to achieve maximum interoperability performance.
*
* \param pD3D10Device - Direct3D device to use for interoperability
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D10DeviceListAll from ::cudaD3D10GetDevices,
* may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D10GetDevice,
* ::cudaGraphicsD3D10RegisterResource,
* ::cudaDeviceReset
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10SetDirect3DDevice(ID3D10Device *pD3D10Device, int device __dv(-1));
/**
* \brief Registers a Direct3D 10 resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaD3D10UnregisterResource(). Also on success, this call will increase
* the internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaD3D10UnregisterResource().
*
* This call potentially has a high-overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following:
*
* - ::ID3D10Buffer: Cannot be used with \p flags set to
* \p cudaD3D10RegisterFlagsArray.
* - ::ID3D10Texture1D: No restrictions.
* - ::ID3D10Texture2D: No restrictions.
* - ::ID3D10Texture3D: No restrictions.
*
* The \p flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following values are allowed.
*
* - ::cudaD3D10RegisterFlagsNone: Specifies that CUDA will access this
* resource through a \p void*. The pointer, size, and pitch for each
* subresource of this resource may be queried through
* ::cudaD3D10ResourceGetMappedPointer(), ::cudaD3D10ResourceGetMappedSize(),
* and ::cudaD3D10ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
* - ::cudaD3D10RegisterFlagsArray: Specifies that CUDA will access this
* resource through a \p CUarray queried on a sub-resource basis through
* ::cudaD3D10ResourceGetMappedArray(). This option is only valid for resources
* of type ::ID3D10Texture1D, ::ID3D10Texture2D, and ::ID3D10Texture3D.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context then
* ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
* or is already registered then ::cudaErrorInvalidResourceHandle is returned.
* If \p pResource cannot be registered then ::cudaErrorUnknown is returned.
*
* \param pResource - Resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsD3D10RegisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10RegisterResource(ID3D10Resource *pResource, unsigned int flags);
/**
* \brief Unregisters a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p resource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle
* is returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnregisterResource(ID3D10Resource *pResource);
/**
* \brief Maps Direct3D Resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are
* mapped by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D
* calls issued before ::cudaD3D10MapResources() will complete before any CUDA
* kernels issued after ::cudaD3D10MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
* is returned. If any of \p ppResources are presently mapped for access by
* CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10MapResources(int count, ID3D10Resource **ppResources);
/**
* \brief Unmaps Direct3D resources
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resource in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cudaD3D10UnmapResources() will complete before any Direct3D
* calls issued after ::cudaD3D10UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnmapResources(int count, ID3D10Resource **ppResources);
/**
* \brief Gets an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *ppArray an array through which the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p subResource may be
* accessed. The value set in \p ppArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D10RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter, see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param ppArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedArray(cudaArray **ppArray, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set usage flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaD3D10MapFlagsNone: Specifies no hints about how this resource will
* be used. It is therefore assumed that this resource will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaD3D10MapFlagsReadOnly: Specifies that CUDA kernels which access
* this resource will not write to this resource.
* - ::cudaD3D10MapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this resource will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously stored in
* the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidHandle is returned. If \p pResource is presently mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param pResource - Registered resource to set flags for
* \param flags - Parameters for resource mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown,
* \notefnerr
*
* \sa ::cudaGraphicsResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int flags);
/**
* \brief Gets the dimensions of a registered Direct3D surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource which corresponds
* to \p subResource.
*
* Since anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
* ::ID3D10Texture3D, or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidHandle is returned.
* For usage requirements of \p subResource parameters see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPointer the base pointer of the subresource of the mapped
* Direct3D resource \p pResource which corresponds to \p subResource. The
* value set in \p pPointer may change every time that \p pResource is mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
 * ::cudaD3D10RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
*
* If \p pResource is of type ::ID3D10Buffer then \p subResource must be 0.
* If \p pResource is of any other type, then the value of \p subResource must
* come from the subresource calculation in ::D3D10CalcSubResource().
*
* \param pPointer - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPointer(void **pPointer, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource which corresponds to \p subResource. The value set in
* \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidHandle is returned. If \p pResource was not registered
* with usage flags ::cudaD3D10RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped for
* access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int subResource);
/**
* \brief Gets the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p subResource. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
 * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
* ::ID3D10Texture3D, or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was
* not registered with usage flags ::cudaD3D10RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of the \p subResource parameter see
* ::cudaD3D10ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param subResource - Subresource of pResource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int subResource);
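The two formulas above reduce to one address computation. A tiny helper, illustrative only, that mirrors them exactly (pass z = 0 and slicePitch = 0 for a 2D subresource):

```cuda
#include <cstddef>
#include <cstdint>

// Byte address of texel (x, y, z) in a mapped subresource, as documented above:
// z * slicePitch + y * pitch + bytesPerPixel * x.
inline void* texel_address(void* base, size_t pitch, size_t slicePitch,
                           size_t bytesPerPixel, size_t x, size_t y, size_t z)
{
    return static_cast<uint8_t*>(base) + z * slicePitch + y * pitch + bytesPerPixel * x;
}
```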
/** @} */ /* END CUDART_D3D10_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D10_INTEROP_H__ */

View file

@@ -1,323 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D11_INTEROP_H__)
#define __CUDA_D3D11_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d11.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D11 Direct3D 11 Interoperability
* This section describes the Direct3D 11 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 11
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D11 device
*/
enum cudaD3D11DeviceList
{
cudaD3D11DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D11 device */
cudaD3D11DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
cudaD3D11DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */
};
/**
* \brief Register a Direct3D 11 resource for access by CUDA
*
* Registers the Direct3D 11 resource \p pD3DResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::ID3D11Buffer: may be accessed via a device pointer
* - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
* - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported DXGI formats is as follows. For compactness the
* notation A_{B,C,D} represents A_B, A_C, and A_D.
* - DXGI_FORMAT_A8_UNORM
* - DXGI_FORMAT_B8G8R8A8_UNORM
* - DXGI_FORMAT_B8G8R8X8_UNORM
* - DXGI_FORMAT_R16_FLOAT
* - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R32_FLOAT
* - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
* - DXGI_FORMAT_R32_{SINT,UINT}
* - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
* - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
* - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D11RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D11RegisterResource(struct cudaGraphicsResource **resource, ID3D11Resource *pD3DResource, unsigned int flags);
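/*
 * A minimal sketch of the register/map flow described above, assuming the
 * application already owns an ID3D11Texture2D `tex` (name illustrative) with a
 * DXGI format from the supported list; the function name is illustrative only.
 */
static cudaError_t cudaD3D11MapTextureSketch(ID3D11Texture2D *tex, cudaArray_t *outArray)
{
    struct cudaGraphicsResource *res = NULL;
    cudaError_t err = cudaGraphicsD3D11RegisterResource(&res, (ID3D11Resource *)tex,
                                                        cudaGraphicsRegisterFlagsNone);
    if (err != cudaSuccess)
        return err;
    err = cudaGraphicsMapResources(1, &res, 0);                /* map on the default stream */
    if (err == cudaSuccess) {
        err = cudaGraphicsSubResourceGetMappedArray(outArray, res, 0, 0); /* subresource 0, mip 0 */
        /* ... kernels or cudaMemcpy2DToArray would use *outArray here ... */
        cudaGraphicsUnmapResources(1, &res, 0);
    }
    cudaGraphicsUnregisterResource(res);
    return err;
}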
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
* will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
*
* \param device - Returns the device corresponding to pAdapter
* \param pAdapter - D3D11 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D11GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevice(int *device, IDXGIAdapter *pAdapter);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 11 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 11 device \p pD3D11Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 11 device \p pD3D11Device.
*
 * If any of the GPUs being used to render \p pD3D11Device are not CUDA capable, then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D11Device - Direct3D 11 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D11DeviceListAll for all devices,
* ::cudaD3D11DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D11DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D11GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, enum cudaD3D11DeviceList deviceList);
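/*
 * A minimal sketch of the enumeration above, assuming an existing ID3D11Device
 * `dev` (name illustrative): return the first CUDA device rendering for it, or
 * -1 if none is CUDA-capable.
 */
static int cudaD3D11PickDeviceSketch(ID3D11Device *dev)
{
    unsigned int count = 0;
    int devices[8];
    if (cudaD3D11GetDevices(&count, devices, 8, dev, cudaD3D11DeviceListAll) != cudaSuccess ||
        count == 0)
        return -1;
    return devices[0];
}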
/** @} */ /* END CUDART_D3D11 */
/**
* \addtogroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 11 interoperability functions.
*
* @{
*/
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param ppD3D11Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D11SetDirect3DDevice
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11GetDirect3DDevice(ID3D11Device **ppD3D11Device);
/**
* \brief Sets the Direct3D 11 device to use for interoperability with
* a CUDA device
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with a D3D11
* device in order to achieve maximum interoperability performance.
*
* \param pD3D11Device - Direct3D device to use for interoperability
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D11DeviceListAll from ::cudaD3D11GetDevices,
 *                        or may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D11GetDevice,
* ::cudaGraphicsD3D11RegisterResource,
* ::cudaDeviceReset
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11SetDirect3DDevice(ID3D11Device *pD3D11Device, int device __dv(-1));
/** @} */ /* END CUDART_D3D11_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D11_INTEROP_H__ */

View file

@@ -1,782 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_D3D9_INTEROP_H__)
#define __CUDA_D3D9_INTEROP_H__
#include "cuda_runtime_api.h"
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#include <d3d9.h>
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_D3D9 Direct3D 9 Interoperability
* This section describes the Direct3D 9 interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of Direct3D 9
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to a D3D9 device
*/
enum cudaD3D9DeviceList
{
cudaD3D9DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D9 device */
cudaD3D9DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
cudaD3D9DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */
};
/**
* \brief Gets the Direct3D device against which the current CUDA context was
* created
*
* Returns in \p *ppD3D9Device the Direct3D device against which this CUDA
* context was created in ::cudaD3D9SetDirect3DDevice().
*
* \param ppD3D9Device - Returns the Direct3D device for this thread
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidGraphicsContext,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cuD3D9GetDirect3DDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3D9Device);
/**
* \brief Register a Direct3D 9 resource for access by CUDA
*
* Registers the Direct3D 9 resource \p pD3DResource for access by CUDA.
*
* If this call is successful then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
* internal reference count on \p pD3DResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaGraphicsUnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pD3DResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
* - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
* - ::IDirect3DSurface9: may be accessed through an array.
* Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
* through an array.
*
* The \p flags argument may be used to specify additional parameters at register
* time. The valid values for this parameter are
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations.
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* A complete list of supported formats is as follows:
* - D3DFMT_L8
* - D3DFMT_L16
* - D3DFMT_A8R8G8B8
* - D3DFMT_X8R8G8B8
* - D3DFMT_G16R16
* - D3DFMT_A8B8G8R8
* - D3DFMT_A8
* - D3DFMT_A8L8
* - D3DFMT_Q8W8V8U8
* - D3DFMT_V16U16
* - D3DFMT_A16B16G16R16F
* - D3DFMT_A16B16G16R16
* - D3DFMT_R32F
* - D3DFMT_G16R16F
* - D3DFMT_A32B32G32R32F
* - D3DFMT_G32R32F
* - D3DFMT_R16F
*
* If \p pD3DResource is of incorrect type or is already registered, then
* ::cudaErrorInvalidResourceHandle is returned.
* If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
*
* \param resource - Pointer to returned resource handle
* \param pD3DResource - Direct3D resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsD3D9RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D9RegisterResource(struct cudaGraphicsResource **resource, IDirect3DResource9 *pD3DResource, unsigned int flags);
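/*
 * A minimal sketch of the buffer path described above, assuming an existing
 * IDirect3DVertexBuffer9 `vb` (name illustrative) and a device association
 * already handled (for example via ::cudaD3D9SetDirect3DDevice below, when it
 * is required at all).
 */
static cudaError_t cudaD3D9MapVertexBufferSketch(IDirect3DVertexBuffer9 *vb)
{
    struct cudaGraphicsResource *res = NULL;
    void  *dptr = NULL;
    size_t size = 0;
    cudaError_t err = cudaGraphicsD3D9RegisterResource(&res, (IDirect3DResource9 *)vb,
                                                       cudaGraphicsRegisterFlagsNone);
    if (err != cudaSuccess)
        return err;
    err = cudaGraphicsMapResources(1, &res, 0);
    if (err == cudaSuccess) {
        err = cudaGraphicsResourceGetMappedPointer(&dptr, &size, res);
        /* ... kernels reading/writing dptr (size bytes) would run here ... */
        cudaGraphicsUnmapResources(1, &res, 0);
    }
    cudaGraphicsUnregisterResource(res);
    return err;
}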
/**
* \brief Gets the device number for an adapter
*
* Returns in \p *device the CUDA-compatible device corresponding to the
* adapter name \p pszAdapterName obtained from ::EnumDisplayDevices or
* ::IDirect3D9::GetAdapterIdentifier(). If no device on the adapter with name
* \p pszAdapterName is CUDA-compatible then the call will fail.
*
* \param device - Returns the device corresponding to pszAdapterName
* \param pszAdapterName - D3D9 adapter to get device for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaD3D9SetDirect3DDevice,
* ::cudaGraphicsD3D9RegisterResource,
* ::cuD3D9GetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevice(int *device, const char *pszAdapterName);
/**
* \brief Gets the CUDA devices corresponding to a Direct3D 9 device
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
* to the Direct3D 9 device \p pD3D9Device.
 * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
* corresponding to the Direct3D 9 device \p pD3D9Device.
*
 * If any of the GPUs being used to render \p pD3D9Device are not CUDA capable, then the
* call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
* \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param pD3D9Device - Direct3D 9 device to query for CUDA devices
* \param deviceList - The set of devices to return. This set may be
* ::cudaD3D9DeviceListAll for all devices,
* ::cudaD3D9DeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaD3D9DeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuD3D9GetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, enum cudaD3D9DeviceList deviceList);
/**
* \brief Sets the Direct3D 9 device to use for interoperability with
* a CUDA device
*
* Records \p pD3D9Device as the Direct3D 9 device to use for Direct3D 9
* interoperability with the CUDA device \p device and sets \p device as
* the current device for the calling host thread.
*
* If \p device has already been initialized then this call will fail with
* the error ::cudaErrorSetOnActiveProcess. In this case it is necessary
* to reset \p device using ::cudaDeviceReset() before Direct3D 9
* interoperability on \p device may be enabled.
*
* Successfully initializing CUDA interoperability with \p pD3D9Device
* will increase the internal reference count on \p pD3D9Device. This
* reference count will be decremented when \p device is reset using
* ::cudaDeviceReset().
*
* Note that this function is never required for correct functionality. Use of
* this function will result in accelerated interoperability only when the
 * operating system is Windows Vista or Windows 7, and the device \p pD3D9Device
 * is not an IDirect3DDevice9Ex. In all other circumstances, this function is
* not necessary.
*
* \param pD3D9Device - Direct3D device to use for this thread
* \param device - The CUDA device to use. This device must be among the devices
* returned when querying ::cudaD3D9DeviceListAll from ::cudaD3D9GetDevices,
 *                        or may be set to -1 to automatically select an appropriate CUDA device.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa
* ::cudaD3D9GetDevice,
* ::cudaGraphicsD3D9RegisterResource,
* ::cudaDeviceReset
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9SetDirect3DDevice(IDirect3DDevice9 *pD3D9Device, int device __dv(-1));
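/*
 * A minimal sketch tying ::cudaD3D9GetDevices and ::cudaD3D9SetDirect3DDevice
 * together, assuming an existing IDirect3DDevice9 `d3d9dev` (name illustrative)
 * and that no CUDA work has touched the chosen device yet.
 */
static cudaError_t cudaD3D9BindDeviceSketch(IDirect3DDevice9 *d3d9dev)
{
    unsigned int count = 0;
    int cudaDev = -1;
    cudaError_t err = cudaD3D9GetDevices(&count, &cudaDev, 1, d3d9dev, cudaD3D9DeviceListAll);
    if (err != cudaSuccess || count == 0)
        return cudaErrorNoDevice;
    /* Passing -1 instead would let the runtime choose a suitable device. */
    return cudaD3D9SetDirect3DDevice(d3d9dev, cudaDev);
}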
/** @} */ /* END CUDART_D3D9 */
/**
* \addtogroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
* This section describes deprecated Direct3D 9 interoperability functions.
*
* @{
*/
/**
* CUDA D3D9 Register Flags
*/
enum cudaD3D9RegisterFlags
{
  cudaD3D9RegisterFlagsNone  = 0,  /**< Default; Resource can be accessed through a void* */
cudaD3D9RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */
};
/**
* CUDA D3D9 Map Flags
*/
enum cudaD3D9MapFlags
{
cudaD3D9MapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaD3D9MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaD3D9MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Registers a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the Direct3D resource \p pResource for access by CUDA.
*
* If this call is successful, then the application will be able to map and
* unmap this resource until it is unregistered through
* ::cudaD3D9UnregisterResource(). Also on success, this call will increase
* the internal reference count on \p pResource. This reference count will be
* decremented when this resource is unregistered through
* ::cudaD3D9UnregisterResource().
*
 * This call potentially has a high overhead and should not be called every frame
* in interactive applications.
*
* The type of \p pResource must be one of the following.
*
* - ::IDirect3DVertexBuffer9: No notes.
* - ::IDirect3DIndexBuffer9: No notes.
* - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
* may be explicitly shared. In particular, individual mipmap levels and faces
* of cube maps may not be registered directly. To access individual surfaces
* associated with a texture, one must register the base texture object.
* - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
* associated with all mipmap levels of all faces of the texture will be
* accessible to CUDA.
*
* The \p flags argument specifies the mechanism through which CUDA will
* access the Direct3D resource. The following value is allowed:
*
* - ::cudaD3D9RegisterFlagsNone: Specifies that CUDA will access this
* resource through a \p void*. The pointer, size, and pitch for each
* subresource of this resource may be queried through
* ::cudaD3D9ResourceGetMappedPointer(), ::cudaD3D9ResourceGetMappedSize(),
* and ::cudaD3D9ResourceGetMappedPitch() respectively. This option is valid
* for all resource types.
*
* Not all Direct3D resources of the above types may be used for
* interoperability with CUDA. The following are some limitations:
*
* - The primary rendertarget may not be registered with CUDA.
* - Resources allocated as shared may not be registered with CUDA.
* - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
* not be registered with CUDA.
* - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
* or 32-bit integer or floating-point data cannot be shared.
* - Surfaces of depth or stencil formats cannot be shared.
*
* If Direct3D interoperability is not initialized on this context, then
* ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
 * (e.g., is a non-stand-alone ::IDirect3DSurface9) or is already registered,
* then ::cudaErrorInvalidResourceHandle is returned. If \p pResource cannot
* be registered then ::cudaErrorUnknown is returned.
*
* \param pResource - Resource to register
* \param flags - Parameters for resource registration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsD3D9RegisterResource
*/
extern __host__ cudaError_t CUDARTAPI cudaD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int flags);
/**
* \brief Unregisters a Direct3D resource for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the Direct3D resource \p pResource so it is not accessible by
* CUDA unless registered again.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned.
*
* \param pResource - Resource to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterResource(IDirect3DResource9 *pResource);
/**
* \brief Map Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
*
* The resources in \p ppResources may be accessed in CUDA kernels until they
* are unmapped. Direct3D should not access any resources while they are
* mapped by CUDA. If an application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any Direct3D
* calls issued before ::cudaD3D9MapResources() will complete before any CUDA
* kernels issued after ::cudaD3D9MapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to map for CUDA
* \param ppResources - Resources to map for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapResources(int count, IDirect3DResource9 **ppResources);
/**
* \brief Unmap Direct3D resources for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the \p count Direct3D resources in \p ppResources.
*
* This function provides the synchronization guarantee that any CUDA kernels
* issued before ::cudaD3D9UnmapResources() will complete before any Direct3D
* calls issued after ::cudaD3D9UnmapResources() begin.
*
* If any of \p ppResources have not been registered for use with CUDA or if
* \p ppResources contains any duplicate entries, then
* ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
* not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to unmap for CUDA
* \param ppResources - Resources to unmap for CUDA
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapResources(int count, IDirect3DResource9 **ppResources);
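/*
 * A minimal sketch of the deprecated map/unmap pair above, assuming `pRes`
 * (name illustrative) was registered earlier with
 * ::cudaD3D9RegisterResource(pRes, cudaD3D9RegisterFlagsNone).
 */
static cudaError_t cudaD3D9LegacyMapSketch(IDirect3DResource9 *pRes)
{
    cudaError_t err = cudaD3D9MapResources(1, &pRes);  /* prior D3D9 work completes first */
    if (err != cudaSuccess)
        return err;
    /* ... CUDA kernels may access the resource while it stays mapped ... */
    return cudaD3D9UnmapResources(1, &pRes);           /* CUDA work completes before later D3D9 calls */
}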
/**
* \brief Set usage flags for mapping a Direct3D resource
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the Direct3D resource \p pResource.
*
* Changes to flags will take effect the next time \p pResource is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaD3D9MapFlagsNone: Specifies no hints about how this resource will
* be used. It is therefore assumed that this resource will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaD3D9MapFlagsReadOnly: Specifies that CUDA kernels which access this
* resource will not write to this resource.
* - ::cudaD3D9MapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this resource will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously stored in
* the resource will be preserved.
*
* If \p pResource has not been registered for use with CUDA, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is presently
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
*
* \param pResource - Registered resource to set flags for
* \param flags - Parameters for resource mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaInteropResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int flags);
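/*
 * A minimal sketch of the flag semantics above: mark a registered, currently
 * unmapped resource `pRes` (name illustrative) as write-discard so its previous
 * contents need not be preserved on the next map.
 */
static cudaError_t cudaD3D9MarkWriteDiscardSketch(IDirect3DResource9 *pRes)
{
    return cudaD3D9ResourceSetMapFlags(pRes, cudaD3D9MapFlagsWriteDiscard);
}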
/**
* \brief Get the dimensions of a registered Direct3D surface
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
* subresource of the mapped Direct3D resource \p pResource which corresponds
* to \p face and \p level.
*
* Since anti-aliased surfaces may have multiple samples per pixel, it is
* possible that the dimensions of a resource will be an integer factor larger
* than the dimensions reported by the Direct3D runtime.
*
* The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
* surfaces, the value returned in \p *pDepth will be 0.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or
* ::IDirect3DSurface9 or if \p pResource has not been registered for use with
* CUDA, then ::cudaErrorInvalidResourceHandle is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer.
*
* \param pWidth - Returned width of surface
* \param pHeight - Returned height of surface
* \param pDepth - Returned depth of surface
* \param pResource - Registered resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* \notefnerr
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
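/*
 * A minimal sketch of the dimension query above for a registered stand-alone
 * IDirect3DSurface9 `surf` (name illustrative); as noted, *pDepth is returned
 * as 0 for 2D surfaces.
 */
static cudaError_t cudaD3D9SurfaceExtentSketch(IDirect3DSurface9 *surf, size_t *w, size_t *h)
{
    size_t depth = 0;
    return cudaD3D9ResourceGetSurfaceDimensions(w, h, &depth, (IDirect3DResource9 *)surf, 0, 0);
}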
/**
* \brief Get an array through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pArray an array through which the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p face and \p level
* may be accessed. The value set in \p pArray may change every time that
* \p pResource is mapped.
*
* If \p pResource is not registered then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D9RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
* returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param ppArray - Returned array corresponding to subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedArray(cudaArray **ppArray, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
/**
* \brief Get a pointer through which to access a subresource of a Direct3D
* resource which has been mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPointer the base pointer of the subresource of the mapped
* Direct3D resource \p pResource, which corresponds to \p face and \p level.
* The value set in \p pPointer may change every time that \p pResource is
* mapped.
*
* If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource was not registered with usage flags
* ::cudaD3D9RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
* returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
* returned.
*
 * If \p pResource is of type ::IDirect3DCubeTexture9, then \p face must be one
* of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types,
* \p face must be 0. If \p face is invalid, then ::cudaErrorInvalidValue is
* returned.
*
* If \p pResource is of type ::IDirect3DBaseTexture9, then \p level must
* correspond to a valid mipmap level. Only mipmap level 0 is supported for
* now. For all other types \p level must be 0. If \p level is invalid, then
* ::cudaErrorInvalidValue is returned.
*
* \param pPointer - Returned pointer corresponding to subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPointer(void **pPointer, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
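/*
 * A minimal sketch of the face/level rules above, assuming `cube` (name
 * illustrative) is a registered and currently mapped IDirect3DCubeTexture9;
 * only mipmap level 0 is queried, matching the restriction in the text.
 */
static void *cudaD3D9CubeFacePointerSketch(IDirect3DCubeTexture9 *cube, D3DCUBEMAP_FACES face)
{
    void *ptr = NULL;
    if (cudaD3D9ResourceGetMappedPointer(&ptr, (IDirect3DResource9 *)cube,
                                         (unsigned int)face, 0) != cudaSuccess)
        return NULL;
    return ptr;
}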
/**
* \brief Get the size of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pSize the size of the subresource of the mapped Direct3D
* resource \p pResource, which corresponds to \p face and \p level. The value
* set in \p pSize may change every time that \p pResource is mapped.
*
* If \p pResource has not been registered for use with CUDA then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
* registered with usage flags ::cudaD3D9RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param pSize - Returned size of subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
/**
* \brief Get the pitch of a subresource of a Direct3D resource which has been
* mapped for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
* the subresource of the mapped Direct3D resource \p pResource, which
* corresponds to \p face and \p level. The values set in \p pPitch and
* \p pPitchSlice may change every time that \p pResource is mapped.
*
* The pitch and Z-slice pitch values may be used to compute the location of a
* sample on a surface as follows.
*
* For a 2D surface, the byte offset of the sample at position \b x, \b y from
* the base pointer of the surface is:
*
* \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* For a 3D surface, the byte offset of the sample at position \b x, \b y,
* \b z from the base pointer of the surface is:
*
 * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
*
* Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
* NULL.
*
* If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
* sub-types or if \p pResource has not been registered for use with CUDA,
* then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
* registered with usage flags ::cudaD3D9RegisterFlagsNone, then
* ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
* for access by CUDA then ::cudaErrorUnknown is returned.
*
* For usage requirements of \p face and \p level parameters, see
* ::cudaD3D9ResourceGetMappedPointer().
*
* \param pPitch - Returned pitch of subresource
* \param pPitchSlice - Returned Z-slice pitch of subresource
* \param pResource - Mapped resource to access
* \param face - Face of resource to access
* \param level - Level of resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
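/*
 * A minimal sketch combining the pointer, size, and pitch queries above for a
 * mapped 2D texture `tex` (name illustrative), face 0, mipmap level 0,
 * registered with ::cudaD3D9RegisterFlagsNone.
 */
static void *cudaD3D9TexelAddressSketch(IDirect3DBaseTexture9 *tex,
                                        size_t x, size_t y, size_t bytesPerPixel)
{
    void  *base = NULL;
    size_t size = 0, pitch = 0, slicePitch = 0, offset;
    if (cudaD3D9ResourceGetMappedPointer(&base, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess ||
        cudaD3D9ResourceGetMappedSize(&size, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess ||
        cudaD3D9ResourceGetMappedPitch(&pitch, &slicePitch, (IDirect3DResource9 *)tex, 0, 0) != cudaSuccess)
        return NULL;
    /* 2D form of the formula above: y * pitch + bytesPerPixel * x */
    offset = y * pitch + bytesPerPixel * x;
    return (offset < size) ? (char *)base + offset : NULL;
}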
/* D3D9 1.x interop interface */
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9Begin(IDirect3DDevice9 *pDevice);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9End(void);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapVertexBuffer(void **dptr, IDirect3DVertexBuffer9 *pVB);
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
/** @} */ /* END CUDART_D3D9_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#undef __CUDA_DEPRECATED
#endif /* __CUDA_D3D9_INTEROP_H__ */

View file

@@ -1,265 +0,0 @@
/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
inline __device__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#if defined(__cplusplus)
}
#endif
#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
# define __DEPRECATED__(msg)
#elif defined(_WIN32)
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
# define __DEPRECATED__(msg) __attribute__((deprecated))
#else
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
#endif
#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING)
# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated and will not be supported in a future release. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
#else
# define __CDPRT_DEPRECATED(func_name)
#endif
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "crt/host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
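/*
 * A minimal device-side launch sketch: the <<< >>> syntax below is what the
 * compiler lowers onto ::cudaGetParameterBuffer/::cudaLaunchDevice. It assumes
 * separate compilation (-rdc=true) for an sm_35+ target; `childKernelSketch`
 * and `parentKernelSketch` are illustrative names only.
 */
#if defined(__CUDACC__) && 0   /* illustration only, excluded from compilation */
__global__ void childKernelSketch(int *data) { data[threadIdx.x] += 1; }
__global__ void parentKernelSketch(int *data)
{
    if (threadIdx.x == 0) {
        childKernelSketch<<<1, 32>>>(data);      /* device-side launch */
        (void)cudaGetLastError();                /* launch errors surface here */
    }
}
#endif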
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#undef __DEPRECATED__
#undef __CDPRT_DEPRECATED
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */

View file

@@ -1,642 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_EGL_INTEROP_H__)
#define __CUDA_EGL_INTEROP_H__
#include "cuda_runtime_api.h"
#include "cuda_runtime.h"
#include "cudart_platform.h"
#include "EGL/egl.h"
#include "EGL/eglext.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_TYPES
* @{
*/
/**
* Maximum number of planes per frame
*/
#define CUDA_EGL_MAX_PLANES 3
/**
* CUDA EglFrame type - array or pointer
*/
typedef enum cudaEglFrameType_enum
{
cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
} cudaEglFrameType;
/**
* Resource location flags- sysmem or vidmem
*
 * For a CUDA context on an iGPU, video and system memory are equivalent, so
 * these flags have no effect on execution.
 *
 * For a CUDA context on a dGPU, applications can use the
 * ::cudaEglResourceLocationFlags flags to give a hint about the desired location.
*
* ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
* to be accessed by CUDA.
*
* ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
* video memory to be accessed by CUDA.
*
 * There may be additional latency due to new allocation and data migration
 * if the frame is produced in a different memory location.
*/
typedef enum cudaEglResourceLocationFlags_enum {
cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
} cudaEglResourceLocationFlags;
/**
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
*/
typedef enum cudaEglColorFormat_enum {
cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
cudaEglColorFormatR = 9, /**< single color channel in one surface. */
cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
} cudaEglColorFormat;
/**
* CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
*/
typedef struct cudaEglPlaneDesc_st {
unsigned int width; /**< Width of plane */
unsigned int height; /**< Height of plane */
unsigned int depth; /**< Depth of plane */
unsigned int pitch; /**< Pitch of plane */
unsigned int numChannels; /**< Number of channels for the plane */
struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
unsigned int reserved[4]; /**< Reserved for future use */
} cudaEglPlaneDesc;
/**
* CUDA EGLFrame Descriptor - structure defining one frame of EGL.
*
* Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
* Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
* \code
* typedef struct cudaEglPlaneDesc_st {
* unsigned int width;
* unsigned int height;
* unsigned int depth;
* unsigned int pitch;
* unsigned int numChannels;
* struct cudaChannelFormatDesc channelDesc;
* unsigned int reserved[4];
* } cudaEglPlaneDesc;
* \endcode
*/
typedef struct cudaEglFrame_st {
union {
cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
} frame;
cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
unsigned int planeCount; /**< Number of planes */
cudaEglFrameType frameType; /**< Array or Pitch */
cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
} cudaEglFrame;
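/**
 * A minimal sketch of filling a pitch-type, single-plane ::cudaEglFrame for an RGBA
 * surface (the device allocation, pitch and dimensions below are illustrative
 * placeholders, not values mandated by the API):
 * \code
 * void *devPtr = NULL;
 * size_t pitch = 0;
 * const unsigned int width = 1920, height = 1080;
 * cudaMallocPitch(&devPtr, &pitch, width * 4, height); // 4 bytes per RGBA pixel
 *
 * cudaEglFrame frame;
 * memset(&frame, 0, sizeof(frame));
 * frame.frameType      = cudaEglFrameTypePitch;
 * frame.eglColorFormat = cudaEglColorFormatARGB;
 * frame.planeCount     = 1;
 * frame.frame.pPitch[0] = make_cudaPitchedPtr(devPtr, pitch, width, height);
 * frame.planeDesc[0].width       = width;
 * frame.planeDesc[0].height      = height;
 * frame.planeDesc[0].depth       = 1;
 * frame.planeDesc[0].pitch       = (unsigned int)pitch;
 * frame.planeDesc[0].numChannels = 4;
 * frame.planeDesc[0].channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
 * \endcode
 */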
/**
 * CUDA EGLStream Connection
*/
typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
/** @} */ /* END CUDART_TYPES */
/**
* \addtogroup CUDART_EGL EGL Interoperability
* This section describes the EGL interoperability functions of the CUDA
* runtime application programming interface.
*
* @{
*/
/**
* \brief Registers an EGL image
*
* Registers the EGLImageKHR specified by \p image for access by
* CUDA. A handle to the registered object is returned as \p pCudaResource.
* Additional Mapping/Unmapping is not required for the registered resource and
* ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
*
 * The application is responsible for synchronizing access to shared objects.
 * The application must ensure that any pending operations which access the objects have completed
 * before passing control to CUDA. This may be accomplished by issuing and waiting for a
 * glFinish command on all GL contexts (for OpenGL, and likewise for other APIs).
 * The application is also responsible for ensuring that any pending operations on the
 * registered CUDA resource have completed prior to executing subsequent commands in other APIs
 * accessing the same memory objects.
 * This can be accomplished by calling cuCtxSynchronize or, preferably, cuEventSynchronize.
*
* The surface's intended usage is specified using \p flags, as follows:
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
 * The EGLImageKHR is an object which can be used to create an EGLImage target resource. It is defined as a void pointer.
* typedef void* EGLImageKHR
*
* \param pCudaResource - Pointer to the returned object handle
* \param image - An EGLImageKHR image which can be used to create target resource.
* \param flags - Map flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsResourceGetMappedEglFrame,
* ::cuGraphicsEGLRegisterImage
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
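/**
 * A minimal usage sketch, assuming \p eglImage is an EGLImageKHR already created
 * through the EGL API:
 * \code
 * struct cudaGraphicsResource *cudaResource = NULL;
 * cudaError_t err = cudaGraphicsEGLRegisterImage(&cudaResource, eglImage,
 *                                                cudaGraphicsRegisterFlagsNone);
 * if (err == cudaSuccess) {
 *     // Access the frame via cudaGraphicsResourceGetMappedEglFrame(), then clean up.
 *     cudaGraphicsUnregisterResource(cudaResource);
 * }
 * \endcode
 */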
/**
* \brief Connect CUDA to EGLStream as a consumer.
*
* Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
*
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
* API to another.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerConnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
/**
* \brief Connect CUDA to EGLStream as a consumer with given flags.
*
 * Connect CUDA as a consumer to the EGLStreamKHR specified by \p eglStream, with the
 * specified \p flags defined by ::cudaEglResourceLocationFlags.
*
* The flags specify whether the consumer wants to access frames from system memory or video memory.
* Default is ::cudaEglResourceLocationVidmem.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
* \param flags - Flags denote intended location - system or video.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerConnectWithFlags
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
/**
 * \brief Disconnect CUDA as a consumer from EGLStream.
 *
 * Disconnect CUDA as a consumer from EGLStreamKHR.
 *
 * \param conn - Connection to disconnect.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerDisconnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
/**
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
*
* Acquire an image frame from EGLStreamKHR.
* ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
* ::cudaEglFrame.
*
* \param conn - Connection on which to acquire
* \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
* \param pStream - CUDA stream for synchronization and any data migrations
* implied by ::cudaEglResourceLocationFlags.
* \param timeout - Desired timeout in usec.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown,
* ::cudaErrorLaunchTimeout
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerReleaseFrame,
* ::cuEGLStreamConsumerAcquireFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
/**
* \brief Releases the last frame acquired from the EGLStream.
*
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
*
* \param conn - Connection on which to release
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
* \param pStream - CUDA stream on which release will be done.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamConsumerConnect,
* ::cudaEGLStreamConsumerDisconnect,
* ::cudaEGLStreamConsumerAcquireFrame,
* ::cuEGLStreamConsumerReleaseFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
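/**
 * A minimal consumer-side sketch, assuming \p eglStream is an existing EGLStreamKHR
 * with a producer attached (the 16000 usec timeout is an arbitrary example value):
 * \code
 * cudaEglStreamConnection conn;
 * cudaGraphicsResource_t resource = NULL;
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 * if (cudaEGLStreamConsumerConnect(&conn, eglStream) == cudaSuccess) {
 *     if (cudaEGLStreamConsumerAcquireFrame(&conn, &resource, &stream, 16000) == cudaSuccess) {
 *         cudaEglFrame eglFrame;
 *         cudaGraphicsResourceGetMappedEglFrame(&eglFrame, resource, 0, 0);
 *         // ... process eglFrame.frame.pArray[i] or eglFrame.frame.pPitch[i] on `stream` ...
 *         cudaEGLStreamConsumerReleaseFrame(&conn, resource, &stream);
 *     }
 *     cudaEGLStreamConsumerDisconnect(&conn);
 * }
 * cudaStreamDestroy(stream);
 * \endcode
 */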
/**
* \brief Connect CUDA to EGLStream as a producer.
*
 * Connect CUDA as a producer to the EGLStreamKHR specified by \p eglStream.
*
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
* API to another.
*
* \param conn - Pointer to the returned connection handle
* \param eglStream - EGLStreamKHR handle
* \param width - width of the image to be submitted to the stream
* \param height - height of the image to be submitted to the stream
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerConnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
EGLStreamKHR eglStream, EGLint width, EGLint height);
/**
 * \brief Disconnect CUDA as a producer from EGLStream.
 *
 * Disconnect CUDA as a producer from EGLStreamKHR.
 *
 * \param conn - Connection to disconnect.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerDisconnect
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
/**
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
*
* The ::cudaEglFrame is defined as:
* \code
* typedef struct cudaEglFrame_st {
* union {
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
* } frame;
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
* unsigned int planeCount;
* cudaEglFrameType frameType;
* cudaEglColorFormat eglColorFormat;
* } cudaEglFrame;
* \endcode
*
* For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
* allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
* the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
*
* \param conn - Connection on which to present the CUDA array
 * \param eglframe - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
* \param pStream - CUDA stream on which to present the frame.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerReturnFrame,
* ::cuEGLStreamProducerPresentFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
cudaEglFrame eglframe, cudaStream_t *pStream);
/**
* \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
*
 * This API can potentially return ::cudaErrorLaunchTimeout if the consumer has not
 * returned a frame to the EGL stream. If a timeout is returned, the application can retry.
*
* \param conn - Connection on which to present the CUDA array
 * \param eglframe - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
* \param pStream - CUDA stream on which to return the frame.
*
* \return
* ::cudaSuccess,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
* \sa
* ::cudaEGLStreamProducerConnect,
* ::cudaEGLStreamProducerDisconnect,
* ::cudaEGLStreamProducerPresentFrame,
* ::cuEGLStreamProducerReturnFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
cudaEglFrame *eglframe, cudaStream_t *pStream);
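/**
 * A minimal producer-side sketch, assuming \p eglStream already has a consumer
 * connected and \p frame is a populated ::cudaEglFrame (for example, filled as in the
 * sketch following the ::cudaEglFrame definition):
 * \code
 * cudaEglStreamConnection conn;
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 * if (cudaEGLStreamProducerConnect(&conn, eglStream, 1920, 1080) == cudaSuccess) {
 *     cudaEGLStreamProducerPresentFrame(&conn, frame, &stream);
 *     // Later, once the consumer has released the frame, take it back for reuse.
 *     cudaEglFrame returned;
 *     cudaEGLStreamProducerReturnFrame(&conn, &returned, &stream);
 *     cudaEGLStreamProducerDisconnect(&conn);
 * }
 * cudaStreamDestroy(stream);
 * \endcode
 */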
/**
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
*
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
* \p resource may be accessed.
* This API can only be called for EGL graphics resources.
*
* The ::cudaEglFrame is defined as
* \code
* typedef struct cudaEglFrame_st {
* union {
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
* } frame;
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
* unsigned int planeCount;
* cudaEglFrameType frameType;
* cudaEglColorFormat eglColorFormat;
* } cudaEglFrame;
* \endcode
*
*
* \param eglFrame - Returned eglFrame.
* \param resource - Registered resource to access.
* \param index - Index for cubemap surfaces.
* \param mipLevel - Mipmap level for the subresource to access.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorUnknown
*
 * \note For a multiplanar \p *eglFrame, only the pitch of the first plane (unsigned int cudaEglPlaneDesc::pitch) should be considered by the application.
*
* \sa
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsResourceGetMappedEglFrame
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
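/**
 * A minimal sketch of inspecting the plane layout of a mapped frame, assuming
 * \p cudaResource was obtained from ::cudaGraphicsEGLRegisterImage or
 * ::cudaEGLStreamConsumerAcquireFrame:
 * \code
 * cudaEglFrame eglFrame;
 * if (cudaGraphicsResourceGetMappedEglFrame(&eglFrame, cudaResource, 0, 0) == cudaSuccess) {
 *     for (unsigned int p = 0; p < eglFrame.planeCount; ++p) {
 *         // eglFrame.frameType selects between frame.pArray[p] and frame.pPitch[p];
 *         // per the note above, only planeDesc[0].pitch is meaningful for multiplanar frames.
 *         unsigned int w = eglFrame.planeDesc[p].width;   // plane dimensions, shown for illustration
 *         unsigned int h = eglFrame.planeDesc[p].height;
 *     }
 * }
 * \endcode
 */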
/**
* \brief Creates an event from EGLSync object
*
 * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
 * via \p flags. Valid flags include:
* - ::cudaEventDefault: Default event creation flag.
* - ::cudaEventBlockingSync: Specifies that the created event should use blocking
* synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
* an event created with this flag will block until the event has actually
* been completed.
*
* ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
*
* The EGLSyncKHR is an opaque handle to an EGL sync object.
* typedef void* EGLSyncKHR
*
* \param phEvent - Returns newly created event
* \param eglSync - Opaque handle to EGLSync object
* \param flags - Event creation flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure,
* ::cudaErrorMemoryAllocation
*
* \sa
* ::cudaEventQuery,
* ::cudaEventSynchronize,
* ::cudaEventDestroy
*/
extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
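/**
 * A minimal sketch, assuming \p eglSync is an EGLSyncKHR obtained from eglCreateSyncKHR():
 * \code
 * cudaEvent_t ev;
 * if (cudaEventCreateFromEGLSync(&ev, eglSync, cudaEventDefault) == cudaSuccess) {
 *     cudaEventSynchronize(ev); // wait for the EGL sync object to be signaled
 *     cudaEventDestroy(ev);
 * }
 * \endcode
 */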
/** @} */ /* END CUDART_EGL */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* __CUDA_EGL_INTEROP_H__ */

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,508 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_GL_INTEROP_H__)
#define __CUDA_GL_INTEROP_H__
#include "cuda_runtime_api.h"
#if defined(__APPLE__)
#include <OpenGL/gl.h>
#else /* __APPLE__ */
#if defined(__arm__) || defined(__aarch64__)
#ifndef GL_VERSION
#error Please include the appropriate gl headers before including cuda_gl_interop.h
#endif
#else
#include <GL/gl.h>
#endif
#endif /* __APPLE__ */
/** \cond impl_private */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \addtogroup CUDART_OPENGL OpenGL Interoperability
* This section describes the OpenGL interoperability functions of the CUDA
* runtime application programming interface. Note that mapping of OpenGL
 * resources is performed with the graphics-API-agnostic resource mapping
 * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
*
* @{
*/
/**
* CUDA devices corresponding to the current OpenGL context
*/
enum cudaGLDeviceList
{
cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
};
/**
* \brief Gets the CUDA devices associated with the current OpenGL context
*
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
* context are not CUDA capable then the call will return ::cudaErrorNoDevice.
*
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
* current OpenGL context
* \param pCudaDevices - Returned CUDA devices corresponding to the current
* OpenGL context
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
* \param deviceList - The set of devices to return. This set may be
* ::cudaGLDeviceListAll for all devices,
* ::cudaGLDeviceListCurrentFrame for the devices used to
* render the current frame (in SLI), or
* ::cudaGLDeviceListNextFrame for the devices used to
* render the next frame (in SLI).
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorInvalidGraphicsContext,
* ::cudaErrorUnknown
*
* \note This function is not supported on Mac OS X.
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGLGetDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
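/**
 * A minimal sketch; an OpenGL context must be current on the calling thread:
 * \code
 * unsigned int deviceCount = 0;
 * int devices[8];
 * if (cudaGLGetDevices(&deviceCount, devices, 8, cudaGLDeviceListAll) == cudaSuccess
 *     && deviceCount > 0) {
 *     cudaSetDevice(devices[0]); // use the first device driving this GL context
 * }
 * \endcode
 */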
/**
* \brief Register an OpenGL texture or renderbuffer object
*
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
* A handle to the registered object is returned as \p resource.
*
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
* or ::GL_RENDERBUFFER.
*
* The register flags \p flags specify the intended usage, as follows:
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
* bind this resource to a surface reference.
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
* texture gather operations on this resource.
*
* The following image formats are supported. For brevity's sake, the list is abbreviated.
 * For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
*
* The following image classes are currently disallowed:
* - Textures with borders
* - Multisampled renderbuffers
*
* \param resource - Pointer to the returned object handle
* \param image - name of texture or renderbuffer object to be registered
* \param target - Identifies the type of object specified by \p image
* \param flags - Register flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cuGraphicsGLRegisterImage
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
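/**
 * A minimal sketch, assuming \p tex is a GL_TEXTURE_2D texture object already created
 * and populated by the application:
 * \code
 * struct cudaGraphicsResource *res = NULL;
 * cudaGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
 * cudaGraphicsMapResources(1, &res, 0);
 * cudaArray_t array;
 * cudaGraphicsSubResourceGetMappedArray(&array, res, 0, 0);
 * // ... bind `array` to a texture or surface object and launch kernels ...
 * cudaGraphicsUnmapResources(1, &res, 0);
 * cudaGraphicsUnregisterResource(res);
 * \endcode
 */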
/**
* \brief Registers an OpenGL buffer object
*
* Registers the buffer object specified by \p buffer for access by
* CUDA. A handle to the registered object is returned as \p
* resource. The register flags \p flags specify the intended usage,
* as follows:
*
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
* resource will be used. It is therefore assumed that this resource will be
* read from and written to by CUDA. This is the default value.
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
* will not write to this resource.
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
* CUDA will not read from this resource and will write over the
* entire contents of the resource, so none of the data previously
* stored in the resource will be preserved.
*
* \param resource - Pointer to the returned object handle
* \param buffer - name of buffer object to be registered
* \param flags - Register flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsUnregisterResource,
* ::cudaGraphicsMapResources,
* ::cudaGraphicsResourceGetMappedPointer,
* ::cuGraphicsGLRegisterBuffer
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
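/**
 * A minimal sketch, assuming \p vbo is an OpenGL buffer object created with
 * glGenBuffers()/glBufferData():
 * \code
 * struct cudaGraphicsResource *res = NULL;
 * cudaGraphicsGLRegisterBuffer(&res, vbo, cudaGraphicsRegisterFlagsWriteDiscard);
 * cudaGraphicsMapResources(1, &res, 0);
 * void *devPtr = NULL;
 * size_t size = 0;
 * cudaGraphicsResourceGetMappedPointer(&devPtr, &size, res);
 * // ... fill devPtr from a kernel; the results become visible to OpenGL after unmap ...
 * cudaGraphicsUnmapResources(1, &res, 0);
 * cudaGraphicsUnregisterResource(res);
 * \endcode
 */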
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
/**
* \brief Gets the CUDA device associated with hGpu
*
* Returns the CUDA device associated with a hGpu, if applicable.
*
* \param device - Returns the device associated with hGpu, or -1 if hGpu is
* not a compute device.
* \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::WGL_NV_gpu_affinity,
* ::cuWGLGetDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
#endif
/** @} */ /* END CUDART_OPENGL */
/**
* \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
* This section describes deprecated OpenGL interoperability functionality.
*
* @{
*/
/**
* CUDA GL Map Flags
*/
enum cudaGLMapFlags
{
cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
};
/**
* \brief Sets a CUDA device to use OpenGL interoperability
*
* \deprecated This function is deprecated as of CUDA 5.0.
*
* This function is deprecated and should no longer be used. It is
* no longer necessary to associate a CUDA device with an OpenGL
* context in order to achieve maximum interoperability performance.
*
* \param device - Device to use for OpenGL interoperability
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorSetOnActiveProcess
* \notefnerr
*
* \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
/**
* \brief Registers a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Registers the buffer object of ID \p bufObj for access by
* CUDA. This function must be called before CUDA can map the buffer
* object. The OpenGL context used to create the buffer, or another
* context from the same share group, must be bound to the current
* thread when this is called.
*
* \param bufObj - Buffer object ID to register
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaGraphicsGLRegisterBuffer
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
/**
* \brief Maps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the buffer object of ID \p bufObj into the address space of
* CUDA and returns in \p *devPtr the base pointer of the resulting
* mapping. The buffer must have previously been registered by
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
* by CUDA, any OpenGL operation which references the buffer will
* result in undefined behavior. The OpenGL context used to create
* the buffer, or another context from the same share group, must be
* bound to the current thread when this is called.
*
* All streams in the current thread are synchronized with the current
* GL context.
*
* \param devPtr - Returned device pointer to CUDA object
* \param bufObj - Buffer object ID to map
*
* \return
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
/**
* \brief Unmaps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
* a buffer is unmapped, the base address returned by
* ::cudaGLMapBufferObject() is invalid and subsequent references to
* the address result in undefined behavior. The OpenGL context used
* to create the buffer, or another context from the same share group,
* must be bound to the current thread when this is called.
*
* All streams in the current thread are synchronized with the current
* GL context.
*
* \param bufObj - Buffer object to unmap
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnmapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
/**
* \brief Unregisters a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unregisters the buffer object of ID \p bufObj for access by CUDA
* and releases any CUDA resources associated with the buffer. Once a
* buffer is unregistered, it may no longer be mapped by CUDA. The GL
* context used to create the buffer, or another context from the
* same share group, must be bound to the current thread when this is
* called.
*
* \param bufObj - Buffer object to unregister
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaGraphicsUnregisterResource
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
/**
* \brief Set usage flags for mapping an OpenGL buffer
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Set flags for mapping the OpenGL buffer \p bufObj
*
* Changes to flags will take effect the next time \p bufObj is mapped.
* The \p flags argument may be any of the following:
*
* - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
* be used. It is therefore assumed that this buffer will be read from and
* written to by CUDA kernels. This is the default value.
* - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
* buffer will not write to the buffer.
* - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
* this buffer will not read from the buffer and will write over the
* entire contents of the buffer, so none of the data previously stored in
* the buffer will be preserved.
*
* If \p bufObj has not been registered for use with CUDA, then
* ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
*
* \param bufObj - Registered buffer object to set flags for
* \param flags - Parameters for buffer mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceSetMapFlags
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
/**
* \brief Maps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Maps the buffer object of ID \p bufObj into the address space of
* CUDA and returns in \p *devPtr the base pointer of the resulting
* mapping. The buffer must have previously been registered by
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
* by CUDA, any OpenGL operation which references the buffer will
* result in undefined behavior. The OpenGL context used to create
* the buffer, or another context from the same share group, must be
* bound to the current thread when this is called.
*
 * Stream \p stream is synchronized with the current GL context.
*
* \param devPtr - Returned device pointer to CUDA object
* \param bufObj - Buffer object ID to map
* \param stream - Stream to synchronize
*
* \return
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsMapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
/**
* \brief Unmaps a buffer object for access by CUDA
*
* \deprecated This function is deprecated as of CUDA 3.0.
*
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
* a buffer is unmapped, the base address returned by
* ::cudaGLMapBufferObject() is invalid and subsequent references to
* the address result in undefined behavior. The OpenGL context used
* to create the buffer, or another context from the same share group,
* must be bound to the current thread when this is called.
*
 * Stream \p stream is synchronized with the current GL context.
*
* \param bufObj - Buffer object to unmap
* \param stream - Stream to synchronize
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnmapBufferObjectFailed
* \notefnerr
*
* \sa ::cudaGraphicsUnmapResources
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
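/**
 * A minimal sketch of the deprecated buffer-object flow, assuming \p vbo is an existing
 * OpenGL buffer object; new code should prefer the ::cudaGraphicsGLRegisterBuffer path
 * shown earlier:
 * \code
 * cudaGLRegisterBufferObject(vbo);
 * void *devPtr = NULL;
 * cudaGLMapBufferObject(&devPtr, vbo);
 * // ... access devPtr from CUDA ...
 * cudaGLUnmapBufferObject(vbo);
 * cudaGLUnregisterBufferObject(vbo);
 * \endcode
 */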
/** @} */ /* END CUDART_OPENGL_DEPRECATED */
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __CUDA_DEPRECATED
#endif /* __CUDA_GL_INTEROP_H__ */

File diff suppressed because it is too large

View file

@@ -1,224 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_H_
# define _CUDA_PIPELINE_H_
# include "cuda_pipeline_primitives.h"
# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
-std=c++11 compiler option.
# endif
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# include "cuda_awbarrier.h"
# endif
// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
# else
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
# endif
# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
struct __block_scope_barrier_base;
}}
# endif
_CUDA_PIPELINE_BEGIN_NAMESPACE
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N];
class pipeline {
public:
pipeline(const pipeline&) = delete;
pipeline(pipeline&&) = delete;
pipeline& operator=(const pipeline&) = delete;
pipeline& operator=(pipeline&&) = delete;
_CUDA_PIPELINE_QUALIFIER pipeline();
_CUDA_PIPELINE_QUALIFIER size_t commit();
_CUDA_PIPELINE_QUALIFIER void commit_and_wait();
_CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER void wait_prior();
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
_CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
# endif
private:
size_t current_batch;
};
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe);
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
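/**
 * A minimal device-side usage sketch for the declarations above. It assumes
 * _CUDA_PIPELINE_BEGIN_NAMESPACE expands to nvcuda::experimental (as in the CUDA 11
 * toolkits), a launch with 256 threads per block, and compilation with -std=c++11:
 * \code
 * #include <cuda_pipeline.h>
 *
 * __global__ void scale_copy(float* out, const float* in, size_t n)
 * {
 *     using namespace nvcuda::experimental;
 *     __shared__ float tile[256];
 *     pipeline pipe;
 *     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n) {
 *         // Stage the global->shared copy asynchronously, then wait on the committed batch.
 *         memcpy_async(tile[threadIdx.x], in[i], pipe);
 *         pipe.commit_and_wait();
 *         out[i] = 2.0f * tile[threadIdx.x];
 *     }
 * }
 * \endcode
 */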
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N]
{
return (T(*)[N])ptr;
}
_CUDA_PIPELINE_QUALIFIER
pipeline::pipeline()
: current_batch(0)
{
}
_CUDA_PIPELINE_QUALIFIER
size_t pipeline::commit()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
return this->current_batch++;
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::commit_and_wait()
{
(void)pipeline::commit();
pipeline::wait_prior<0>();
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait(size_t batch)
{
const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
switch (prior) {
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
}
}
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait_prior()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
}
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(awbarrier& barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
}
# endif
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe)
{
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
if (__is_trivially_copyable(T)) {
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
} else {
dst = src;
}
}
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
{
constexpr size_t dst_size = sizeof(*dst);
constexpr size_t src_size = sizeof(*src);
static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
if (__is_trivially_copyable(T)) {
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
} else {
for (size_t i = 0; i < DstN; ++i) {
(*dst)[i] = (i < SrcN) ? (*src)[i] : T();
}
}
}
_CUDA_PIPELINE_END_NAMESPACE
#endif /* !_CUDA_PIPELINE_H_ */
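
Below is a minimal usage sketch of the pipeline API declared in this header (not part of the deleted file; the kernel, buffer names, and launch configuration are illustrative). Each thread stages one element from global into shared memory through the per-thread pipeline, then commits and waits before consuming it:

// Illustrative only; compile with nvcc -std=c++11, launch with 256 threads per block.
#include <cuda_pipeline.h>

__global__ void copy_and_scale(const float* __restrict__ in, float* __restrict__ out, int n)
{
    __shared__ float tile[256];
    nvcuda::experimental::pipeline pipe;

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // On sm_80+ this lowers to cp.async; otherwise the synchronous fallback is used.
        nvcuda::experimental::memcpy_async(tile[threadIdx.x], in[idx], pipe);
        pipe.commit_and_wait();
    }
    __syncthreads();

    if (idx < n) {
        out[idx] = 2.0f * tile[threadIdx.x];
    }
}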

View file

@@ -1,373 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_HELPERS_H_
# define _CUDA_PIPELINE_HELPERS_H_
# define _CUDA_PIPELINE_NAMESPACE nvcuda::experimental
# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
# define _CUDA_PIPELINE_END_NAMESPACE } }
# define _CUDA_PIPELINE_INTERNAL_NAMESPACE _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE } _CUDA_PIPELINE_END_NAMESPACE
# if !defined(_CUDA_PIPELINE_QUALIFIER)
# define _CUDA_PIPELINE_QUALIFIER inline __device__
# endif
# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
# define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
# endif
# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_PIPELINE_ARCH_700_OR_LATER
# endif
# if (__CUDA_ARCH__ >= 800)
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
# else
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
# endif
# if !defined(_CUDA_PIPELINE_MAX_STAGES)
# define _CUDA_PIPELINE_MAX_STAGES 8
# endif
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
# endif
# if !defined(_CUDA_PIPELINE_DEBUG)
# if defined(__CUDACC_DEBUG__)
# define _CUDA_PIPELINE_DEBUG 1
# else
# define _CUDA_PIPELINE_DEBUG 0
# endif
# endif
# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
# include <cassert>
# endif
# define _CUDA_PIPELINE_ASSERT(x) assert((x));
# define _CUDA_PIPELINE_ABORT() assert(0);
# else
# define _CUDA_PIPELINE_ASSERT(x)
# define _CUDA_PIPELINE_ABORT() __trap();
# endif
# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
# else
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
# endif
# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
# else
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
# endif
# if defined(__CUDACC_RTC__)
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
# else
# include <stdint.h>
# endif
_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'");
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
char* const d = reinterpret_cast<char*>(dst);
const char* const s = reinterpret_cast<const char*>(src);
size_t copy_step_size;
if (SourceSize == 0) {
copy_step_size = CopySize;
} else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
copy_step_size = SourceSize;
} else {
copy_step_size = 1;
}
for (size_t i = 0; i < CopySize; i += copy_step_size) {
const bool copy_source = SourceSize && (i < SourceSize);
switch (copy_step_size) {
case 1:
d[i] = copy_source ? s[i] : char();
break;
case 2:
*reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
break;
case 4:
*reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
break;
case 8:
*reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
break;
case 16:
*reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
break;
}
}
}
template<bool UseHwAsyncCopy>
struct ImplementationChooser;
template<>
struct ImplementationChooser<true> {
template<size_t CopySize, size_t SourceSize>
struct CpAsyncChooser {
_CUDA_PIPELINE_STATIC_QUALIFIER
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
{
asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
:
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
"n"(SourceSize)
: "memory");
}
};
template<size_t SourceSize>
struct CpAsyncChooser<16, SourceSize> {
_CUDA_PIPELINE_STATIC_QUALIFIER
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
{
asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
:
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
: "memory");
}
};
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_commit()
{
asm volatile ("cp.async.commit_group;");
}
template<unsigned N>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_wait_prior()
{
asm volatile ("cp.async.wait_group %0;"
:
: "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
_CUDA_PIPELINE_ASSERT(__isShared(barrier));
asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
:
: "r"(__nvvm_get_smem_pointer(barrier)));
}
};
template<>
struct ImplementationChooser<false> {
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_commit()
{
}
template<unsigned N>
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_wait_prior()
{
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
}
};
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
_CUDA_PIPELINE_ASSERT(__isShared(dst));
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
}
_CUDA_PIPELINE_QUALIFIER
void pipeline_commit()
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
}
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline_wait_prior()
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
}
_CUDA_PIPELINE_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
}
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
if (__isGlobal(src) && __isShared(dst)) {
pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
} else {
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
}
}
template<size_t CopySize, size_t Align>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
{
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
const char* s = reinterpret_cast<const char*>(src);
char* d = reinterpret_cast<char*>(dst);
size_t remaining = CopySize;
while (remaining) {
if ((Align >= 16) && (remaining >= 16)) {
pipeline_copy_strict<16, 16>(d, s);
d += 16;
s += 16;
remaining -= 16;
} else if ((Align >= 8) && (remaining >= 8)) {
pipeline_copy_strict<8, 8>(d, s);
d += 8;
s += 8;
remaining -= 8;
} else if ((Align >= 4) && (remaining >= 4)) {
pipeline_copy_strict<4, 4>(d, s);
d += 4;
s += 4;
remaining -= 4;
} else if ((Align >= 2) && (remaining >= 2)) {
*reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
d += 2;
s += 2;
remaining -= 2;
} else {
*d = *s;
d += 1;
s += 1;
remaining -= 1;
}
}
}
_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
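
The synchronous fallback above (pipeline_memcpy_sync) copies SourceSize bytes and zero-fills the rest of the CopySize-byte destination, which is what gives the zfill semantics their meaning on parts without hardware async copy. A host-side analogue of that behaviour, purely illustrative (the helper name is not part of the header):

#include <cstddef>
#include <cstring>

// Copy SourceSize bytes and zero the tail so the destination always holds CopySize bytes.
template<size_t CopySize, size_t SourceSize>
void zfill_copy(void* dst, const void* src)
{
    static_assert(SourceSize <= CopySize, "source must fit in the copy");
    std::memcpy(dst, src, SourceSize);
    std::memset(static_cast<char*>(dst) + SourceSize, 0, CopySize - SourceSize);
}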

View file

@@ -1,148 +0,0 @@
/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
# define _CUDA_PIPELINE_PRIMITIVES_H_
# include "cuda_pipeline_helpers.h"
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
size_t zfill = 0)
{
_CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
_CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
_CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
_CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
switch (size_and_align) {
case 16:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return;
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return;
case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return;
case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return;
case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return;
case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return;
case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return;
case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return;
case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return;
case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
case 8:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return;
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, src_global); return;
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return;
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return;
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
case 4:
switch (zfill) {
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return;
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return;
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return;
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return;
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return;
default: _CUDA_PIPELINE_ABORT(); return;
}
default:
_CUDA_PIPELINE_ABORT();
return;
}
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_commit()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
}
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_wait_prior(size_t prior)
{
switch (prior) {
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
}
}
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# include "cuda_awbarrier_primitives.h"
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_arrive_on(__mbarrier_t* barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
}
# endif
#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
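
A short kernel-level sketch of the primitives above (illustrative names; assumes at most 128 threads per block): each thread issues one 16-byte asynchronous copy whose last 4 bytes are zero-filled, commits the batch, and waits for all committed batches before reading the data back out.

#include <cuda_pipeline_primitives.h>

__global__ void stage_vec4(const float4* __restrict__ gmem, float4* __restrict__ out)
{
    __shared__ float4 smem[128];
    // 16-byte global->shared copy, last 4 bytes zero-filled (zfill = 4).
    __pipeline_memcpy_async(&smem[threadIdx.x], &gmem[threadIdx.x], 16, 4);
    __pipeline_commit();
    __pipeline_wait_prior(0);   // 0 outstanding batches allowed: wait for everything
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}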

View file

@@ -1,207 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_PROFILER_API_H__)
#define __CUDA_PROFILER_API_H__
#include "driver_types.h"
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \ingroup CUDART
* \defgroup CUDART_PROFILER_DEPRECATED Profiler Control [DEPRECATED]
*
* ___MANBRIEF___ profiler control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Initialize the CUDA profiler.
*
* \deprecated
*
 * Using this API, the user can initialize the CUDA profiler by specifying
 * the configuration file, output file, and output file format. This
 * API is generally used to profile different sets of counters by
 * looping over the kernel launch. The \p configFile parameter can be used
* to select profiling options including profiler counters. Refer to
* the "Compute Command Line Profiler User Guide" for supported
* profiler options and counters.
*
* Limitation: The CUDA profiler cannot be initialized with this API
* if another profiling tool is already active, as indicated by the
* ::cudaErrorProfilerDisabled return code.
*
* Typical usage of the profiling APIs is as follows:
*
* for each set of counters/options\n
* {\n
* cudaProfilerInitialize(); //Initialize profiling,set the counters/options in
* the config file \n
* ...\n
* cudaProfilerStart(); \n
* // code to be profiled \n
* cudaProfilerStop();\n
* ...\n
* cudaProfilerStart(); \n
* // code to be profiled \n
* cudaProfilerStop();\n
* ...\n
* }\n
*
*
* \param configFile - Name of the config file that lists the counters/options
* for profiling.
* \param outputFile - Name of the outputFile where the profiling results will
* be stored.
* \param outputMode - outputMode, can be ::cudaKeyValuePair OR ::cudaCSV.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorProfilerDisabled
* \notefnerr
*
* \sa
* ::cudaProfilerStart,
* ::cudaProfilerStop,
* ::cuProfilerInitialize
*/
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaProfilerInitialize(const char *configFile,
const char *outputFile,
cudaOutputMode_t outputMode);
/** @} */ /* END CUDART_PROFILER_DEPRECATED */
/**
* \ingroup CUDART
* \defgroup CUDART_PROFILER Profiler Control
*
* ___MANBRIEF___ profiler control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the profiler control functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Enable profiling.
*
* Enables profile collection by the active profiling tool for the
* current context. If profiling is already enabled, then
* cudaProfilerStart() has no effect.
*
* cudaProfilerStart and cudaProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::cudaProfilerInitialize,
* ::cudaProfilerStop,
* ::cuProfilerStart
*/
extern __host__ cudaError_t CUDARTAPI cudaProfilerStart(void);
/**
* \brief Disable profiling.
*
* Disables profile collection by the active profiling tool for the
* current context. If profiling is already disabled, then
* cudaProfilerStop() has no effect.
*
* cudaProfilerStart and cudaProfilerStop APIs are used to
* programmatically control the profiling granularity by allowing
* profiling to be done only on selective pieces of code.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa
* ::cudaProfilerInitialize,
* ::cudaProfilerStart,
* ::cuProfilerStop
*/
extern __host__ cudaError_t CUDARTAPI cudaProfilerStop(void);
/** @} */ /* END CUDART_PROFILER */
#undef __CUDA_DEPRECATED
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !__CUDA_PROFILER_API_H__ */
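
A minimal host-side sketch of the non-deprecated start/stop pair above, narrowing collection to a single launch (kernel and function names are illustrative):

#include <cuda_profiler_api.h>

__global__ void scale(float* data, int n)            // illustrative kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

void profile_one_launch(float* d_data, int n)
{
    cudaProfilerStart();                              // begin collection for this context
    scale<<<(n + 255) / 256, 256>>>(d_data, n);
    cudaDeviceSynchronize();
    cudaProfilerStop();                               // subsequent work is not profiled
}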

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,103 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_SURFACE_TYPES_H__)
#define __CUDA_SURFACE_TYPES_H__
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "channel_descriptor.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
template<class T, int dim = 1>
struct __device_builtin_surface_type__ surface : public surfaceReference
{
#if !defined(__CUDACC_RTC__)
__host__ surface(void)
{
channelDesc = cudaCreateChannelDesc<T>();
}
__host__ surface(struct cudaChannelFormatDesc desc)
{
channelDesc = desc;
}
#endif /* !__CUDACC_RTC__ */
};
template<int dim>
struct __device_builtin_surface_type__ surface<void, dim> : public surfaceReference
{
#if !defined(__CUDACC_RTC__)
__host__ surface(void)
{
channelDesc = cudaCreateChannelDesc<void>();
}
#endif /* !__CUDACC_RTC__ */
};
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__CUDA_SURFACE_TYPES_H__ */
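
A sketch of how the legacy surface-reference template above is typically used (module-scope reference; the names and the host-side binding shown in the trailing comment are illustrative):

#include <cuda_runtime.h>

surface<void, cudaSurfaceType2D> out_surf;            // module-scope surface reference

__global__ void fill_ones(int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // The x coordinate of a surface write is expressed in bytes.
        surf2Dwrite(1.0f, out_surf, x * sizeof(float), y);
    }
}

// Host side, before launch (sketch):
//   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
//   cudaArray_t arr;
//   cudaMallocArray(&arr, &desc, width, height, cudaArraySurfaceLoadStore);
//   cudaBindSurfaceToArray(out_surf, arr);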

View file

@@ -1,109 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_TEXTURE_TYPES_H__)
#define __CUDA_TEXTURE_TYPES_H__
#if defined(__cplusplus) && defined(__CUDACC__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "channel_descriptor.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "cuda_runtime_api.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
struct __device_builtin_texture_type__ texture : public textureReference
{
#if !defined(__CUDACC_RTC__)
__host__ texture(int norm = 0,
enum cudaTextureFilterMode fMode = cudaFilterModePoint,
enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = cudaCreateChannelDesc<T>();
sRGB = 0;
}
__host__ texture(int norm,
enum cudaTextureFilterMode fMode,
enum cudaTextureAddressMode aMode,
struct cudaChannelFormatDesc desc)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = desc;
sRGB = 0;
}
#endif /* !__CUDACC_RTC__ */
};
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__CUDA_TEXTURE_TYPES_H__ */
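
Similarly, a sketch of the legacy texture-reference template above in use (names illustrative; the host binding is shown as a comment):

#include <cuda_runtime.h>

texture<float, cudaTextureType2D, cudaReadModeElementType> in_tex;   // module-scope texture reference

__global__ void sample(float* out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // Unnormalized coordinates; +0.5f samples the texel centre.
        out[y * width + x] = tex2D(in_tex, x + 0.5f, y + 0.5f);
    }
}

// Host side, before launch (sketch):
//   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
//   cudaArray_t arr;
//   cudaMallocArray(&arr, &desc, width, height);
//   cudaBindTextureToArray(in_tex, arr, desc);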

View file

@@ -1,97 +0,0 @@
/* Copyright 2013,2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cudalibxt.h
* \brief Public header file for the NVIDIA library multi-GPU support structures
*/
#ifndef _CUDA_LIB_XT_H_
#define _CUDA_LIB_XT_H_
#include <cuda_runtime.h>
#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000 // This is added to CUDART_VERSION
enum cudaXtCopyType_t {
LIB_XT_COPY_HOST_TO_DEVICE,
LIB_XT_COPY_DEVICE_TO_HOST,
LIB_XT_COPY_DEVICE_TO_DEVICE
} ;
typedef enum cudaXtCopyType_t cudaLibXtCopyType;
enum libFormat_t {
LIB_FORMAT_CUFFT = 0x0,
LIB_FORMAT_UNDEFINED = 0x1
};
typedef enum libFormat_t libFormat;
#define MAX_CUDA_DESCRIPTOR_GPUS 64
struct cudaXtDesc_t{
int version; //descriptor version
int nGPUs; //number of GPUs
int GPUs[MAX_CUDA_DESCRIPTOR_GPUS]; //array of device IDs
void *data[MAX_CUDA_DESCRIPTOR_GPUS]; //array of pointers to data, one per GPU
size_t size[MAX_CUDA_DESCRIPTOR_GPUS]; //array of data sizes, one per GPU
void *cudaXtState; //opaque CUDA utility structure
};
typedef struct cudaXtDesc_t cudaXtDesc;
struct cudaLibXtDesc_t{
int version; //descriptor version
cudaXtDesc *descriptor; //multi-GPU memory descriptor
libFormat library; //which library recognizes the format
int subFormat; //library specific enumerator of sub formats
void *libDescriptor; //library specific descriptor e.g. FFT transform plan object
};
typedef struct cudaLibXtDesc_t cudaLibXtDesc;
#endif
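
A small host-side sketch of how a library-owned descriptor built on these structures can be inspected (the helper is illustrative; the descriptor itself would come from a library call such as cufftXtMalloc):

#include <cstdio>
#include <cudalibxt.h>

// Walk the per-GPU pointers and sizes held by a multi-GPU descriptor.
void print_desc(const cudaLibXtDesc* d)
{
    const cudaXtDesc* x = d->descriptor;
    std::printf("version %d, %d GPU(s), library %d\n", x->version, x->nGPUs, (int)d->library);
    for (int i = 0; i < x->nGPUs; ++i) {
        std::printf("  GPU %d: %zu bytes at %p\n", x->GPUs[i], x->size[i], x->data[i]);
    }
}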

View file

@@ -1,57 +0,0 @@
/*
* Copyright 2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef __CUDART_PLATFORM_H__
#define __CUDART_PLATFORM_H__
#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
#define isEglSupported 1
#endif
#endif

View file

@@ -1,322 +0,0 @@
/* Copyright 2005-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufft.h
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
*/
#ifndef _CUFFT_H_
#define _CUFFT_H_
#include "cuComplex.h"
#include "driver_types.h"
#include "library_types.h"
#ifndef CUFFTAPI
#ifdef _WIN32
#define CUFFTAPI __stdcall
#elif __GNUC__ >= 4
#define CUFFTAPI __attribute__ ((visibility ("default")))
#else
#define CUFFTAPI
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define CUFFT_VER_MAJOR 10
#define CUFFT_VER_MINOR 7
#define CUFFT_VER_PATCH 1
#define CUFFT_VER_BUILD 0
// cuFFT library version
//
// CUFFT_VERSION / 1000 - major version
// CUFFT_VERSION / 100 % 100 - minor version
// CUFFT_VERSION % 100 - patch level
#define CUFFT_VERSION 10701
// CUFFT API function return values
typedef enum cufftResult_t {
CUFFT_SUCCESS = 0x0,
CUFFT_INVALID_PLAN = 0x1,
CUFFT_ALLOC_FAILED = 0x2,
CUFFT_INVALID_TYPE = 0x3,
CUFFT_INVALID_VALUE = 0x4,
CUFFT_INTERNAL_ERROR = 0x5,
CUFFT_EXEC_FAILED = 0x6,
CUFFT_SETUP_FAILED = 0x7,
CUFFT_INVALID_SIZE = 0x8,
CUFFT_UNALIGNED_DATA = 0x9,
CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
CUFFT_INVALID_DEVICE = 0xB,
CUFFT_PARSE_ERROR = 0xC,
CUFFT_NO_WORKSPACE = 0xD,
CUFFT_NOT_IMPLEMENTED = 0xE,
CUFFT_LICENSE_ERROR = 0x0F,
CUFFT_NOT_SUPPORTED = 0x10
} cufftResult;
#define MAX_CUFFT_ERROR 0x11
// CUFFT defines and supports the following data types
// cufftReal is a single-precision, floating-point real data type.
// cufftDoubleReal is a double-precision, real data type.
typedef float cufftReal;
typedef double cufftDoubleReal;
// cufftComplex is a single-precision, floating-point complex data type that
// consists of interleaved real and imaginary components.
// cufftDoubleComplex is the double-precision equivalent.
typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;
// CUFFT transform directions
#define CUFFT_FORWARD -1 // Forward FFT
#define CUFFT_INVERSE 1 // Inverse FFT
// CUFFT supports the following transform types
typedef enum cufftType_t {
CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType;
// CUFFT supports the following data layouts
typedef enum cufftCompatibility_t {
CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value
} cufftCompatibility;
#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING
//
// structure definition used by the shim between old and new APIs
//
#define MAX_SHIM_RANK 3
// cufftHandle is a handle type used to store and access CUFFT plans.
typedef int cufftHandle;
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx,
cufftType type,
int batch);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny,
cufftType type);
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
int nx, int ny, int nz,
cufftType type);
cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch);
cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
int nx,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
long long int *onembed,
long long int ostride, long long int odist,
cufftType type,
long long int batch,
size_t * workSize);
cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride, long long int idist,
long long int *onembed,
long long int ostride, long long int odist,
cufftType type,
long long int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate1d(int nx,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftEstimateMany(int rank,
int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type,
int batch,
size_t *workSize);
cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
int nx,
cufftType type,
int batch,
size_t *workSize );
cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
int nx, int ny,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
int nx, int ny, int nz,
cufftType type,
size_t *workSize);
cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
int rank, int *n,
int *inembed, int istride, int idist,
int *onembed, int ostride, int odist,
cufftType type, int batch, size_t *workArea);
cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
cufftComplex *idata,
cufftComplex *odata,
int direction);
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
cufftReal *idata,
cufftComplex *odata);
cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
cufftComplex *idata,
cufftReal *odata);
cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleComplex *odata,
int direction);
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
cufftDoubleReal *idata,
cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleReal *odata);
// utility functions
cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
cudaStream_t stream);
cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
cufftResult CUFFTAPI cufftGetVersion(int *version);
cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
int *value);
#ifdef __cplusplus
}
#endif
#endif /* _CUFFT_H_ */
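
A minimal single-GPU use of the API above: plan a 1D complex-to-complex transform, run it in place forward and inverse, then clean up. Error handling is reduced to a sketch and the buffer is assumed to already live on the device:

#include <cuda_runtime.h>
#include <cufft.h>

int run_fft(cufftComplex* d_signal, int nx)           // d_signal: device buffer of nx elements
{
    cufftHandle plan;
    if (cufftPlan1d(&plan, nx, CUFFT_C2C, /*batch=*/1) != CUFFT_SUCCESS) return -1;

    cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD);
    cufftExecC2C(plan, d_signal, d_signal, CUFFT_INVERSE);   // inverse result is scaled by nx

    cudaDeviceSynchronize();
    cufftDestroy(plan);
    return 0;
}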

View file

@@ -1,257 +0,0 @@
/* Copyright 2005-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufftXt.h
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
*/
#ifndef _CUFFTXT_H_
#define _CUFFTXT_H_
#include "cudalibxt.h"
#include "cufft.h"
#ifndef CUFFTAPI
#ifdef _WIN32
#define CUFFTAPI __stdcall
#else
#define CUFFTAPI
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// cufftXtSubFormat identifies the data layout of
// a memory descriptor owned by cufft.
// note that multi GPU cufft does not yet support out-of-place transforms
//
typedef enum cufftXtSubFormat_t {
CUFFT_XT_FORMAT_INPUT = 0x00, //by default input is in linear order across GPUs
CUFFT_XT_FORMAT_OUTPUT = 0x01, //by default output is in scrambled order depending on transform
CUFFT_XT_FORMAT_INPLACE = 0x02, //by default inplace is input order, which is linear across GPUs
CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03, //shuffled output order after execution of the transform
CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04, //shuffled input order prior to execution of 1D transforms
CUFFT_FORMAT_UNDEFINED = 0x05
} cufftXtSubFormat;
//
// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
//
typedef enum cufftXtCopyType_t {
CUFFT_COPY_HOST_TO_DEVICE = 0x00,
CUFFT_COPY_DEVICE_TO_HOST = 0x01,
CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
CUFFT_COPY_UNDEFINED = 0x03
} cufftXtCopyType;
//
// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
//
typedef enum cufftXtQueryType_t {
CUFFT_QUERY_1D_FACTORS = 0x00,
CUFFT_QUERY_UNDEFINED = 0x01
} cufftXtQueryType;
typedef struct cufftXt1dFactors_t {
long long int size;
long long int stringCount;
long long int stringLength;
long long int substringLength;
long long int factor1;
long long int factor2;
long long int stringMask;
long long int substringMask;
long long int factor1Mask;
long long int factor2Mask;
int stringShift;
int substringShift;
int factor1Shift;
int factor2Shift;
} cufftXt1dFactors;
//
// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
//
typedef enum cufftXtWorkAreaPolicy_t {
CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */
CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */
CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
} cufftXtWorkAreaPolicy;
// multi-GPU routines
cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs);
cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
cudaLibXtDesc ** descriptor,
cufftXtSubFormat format);
cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
void *dstPointer,
void *srcPointer,
cufftXtCopyType type);
cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);
cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);
cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output);
// Utility functions
cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType);
// callbacks
typedef enum cufftXtCallbackType_t {
CUFFT_CB_LD_COMPLEX = 0x0,
CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
CUFFT_CB_LD_REAL = 0x2,
CUFFT_CB_LD_REAL_DOUBLE = 0x3,
CUFFT_CB_ST_COMPLEX = 0x4,
CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
CUFFT_CB_ST_REAL = 0x6,
CUFFT_CB_ST_REAL_DOUBLE = 0x7,
CUFFT_CB_UNDEFINED = 0x8
} cufftXtCallbackType;
typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);
cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info);
cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType);
cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize);
cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
cudaDataType inputtype,
long long int *onembed,
long long int ostride,
long long int odist,
cudaDataType outputtype,
long long int batch,
size_t *workSize,
cudaDataType executiontype);
cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
int rank,
long long int *n,
long long int *inembed,
long long int istride,
long long int idist,
cudaDataType inputtype,
long long int *onembed,
long long int ostride,
long long int odist,
cudaDataType outputtype,
long long int batch,
size_t *workSize,
cudaDataType executiontype);
cufftResult CUFFTAPI cufftXtExec(cufftHandle plan,
void *input,
void *output,
int direction);
cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
cudaLibXtDesc *input,
cudaLibXtDesc *output,
int direction);
cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize);
#ifdef __cplusplus
}
#endif
#endif
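The multi-GPU entry points above are easiest to read as a sequence: attach the GPUs to a handle, plan, let the library allocate its descriptor, copy in, execute, copy out. Below is a minimal sketch of a 1D in-place C2C transform across two GPUs; it additionally assumes cufftCreate/cufftMakePlan1d from cufft.h, uses placeholder sizes and device ids, and omits error checking.

#include <cufftXt.h>
#include <cstdlib>

int main() {
    const int N = 1 << 20;            // transform length (placeholder)
    int gpus[2] = {0, 1};             // device ids (placeholder)
    size_t workSizes[2];              // one work-size entry per GPU

    cufftHandle plan;
    cufftCreate(&plan);
    cufftXtSetGPUs(plan, 2, gpus);                       // must precede planning
    cufftMakePlan1d(plan, N, CUFFT_C2C, 1, workSizes);   // single batch, C2C

    cufftComplex *host = (cufftComplex *)malloc(sizeof(cufftComplex) * N);
    // ... fill host[] with input in natural (linear) order ...

    cudaLibXtDesc *desc;                                 // library-owned, split across GPUs
    cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, desc, host, CUFFT_COPY_HOST_TO_DEVICE);

    cufftXtExecDescriptorC2C(plan, desc, desc, CUFFT_FORWARD);   // in-place execution

    cufftXtMemcpy(plan, host, desc, CUFFT_COPY_DEVICE_TO_HOST);  // result copied back to the host

    cufftXtFree(desc);
    cufftDestroy(plan);
    free(host);
    return 0;
}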

View file

@@ -1,454 +0,0 @@
/* Copyright 2005-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*!
* \file cufftw.h
* \brief Public header file for the NVIDIA CUDA FFTW library (CUFFTW)
*/
#ifndef _CUFFTW_H_
#define _CUFFTW_H_
#include <stdio.h>
#include "cufft.h"
#ifdef __cplusplus
extern "C" {
#endif
// transform direction
#define FFTW_FORWARD -1
#define FFTW_INVERSE 1
#define FFTW_BACKWARD 1
// Planner flags
#define FFTW_ESTIMATE 0x01
#define FFTW_MEASURE 0x02
#define FFTW_PATIENT 0x03
#define FFTW_EXHAUSTIVE 0x04
#define FFTW_WISDOM_ONLY 0x05
//Algorithm restriction flags
#define FFTW_DESTROY_INPUT 0x08
#define FFTW_PRESERVE_INPUT 0x0C
#define FFTW_UNALIGNED 0x10
// CUFFTW defines and supports the following data types
// note if complex.h has been included we use the C99 complex types
#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined (complex)
typedef double _Complex fftw_complex;
typedef float _Complex fftwf_complex;
#else
typedef double fftw_complex[2];
typedef float fftwf_complex[2];
#endif
typedef void *fftw_plan;
typedef void *fftwf_plan;
typedef struct {
int n;
int is;
int os;
} fftw_iodim;
typedef fftw_iodim fftwf_iodim;
typedef struct {
ptrdiff_t n;
ptrdiff_t is;
ptrdiff_t os;
} fftw_iodim64;
typedef fftw_iodim64 fftwf_iodim64;
// CUFFTW defines and supports the following double precision APIs
fftw_plan CUFFTAPI fftw_plan_dft_1d(int n,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_2d(int n0,
int n1,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_3d(int n0,
int n1,
int n2,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft(int rank,
const int *n,
fftw_complex *in,
fftw_complex *out,
int sign,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_1d(int n,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_2d(int n0,
int n1,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c_3d(int n0,
int n1,
int n2,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_r2c(int rank,
const int *n,
double *in,
fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_1d(int n,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_2d(int n0,
int n1,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r_3d(int n0,
int n1,
int n2,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_dft_c2r(int rank,
const int *n,
fftw_complex *in,
double *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft(int rank,
const int *n,
int batch,
fftw_complex *in,
const int *inembed, int istride, int idist,
fftw_complex *out,
const int *onembed, int ostride, int odist,
int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft_r2c(int rank,
const int *n,
int batch,
double *in,
const int *inembed, int istride, int idist,
fftw_complex *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_many_dft_c2r(int rank,
const int *n,
int batch,
fftw_complex *in,
const int *inembed, int istride, int idist,
double *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
fftw_complex *in, fftw_complex *out,
int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft_r2c(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
double *in, fftw_complex *out,
unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru_dft_c2r(int rank, const fftw_iodim *dims,
int batch_rank, const fftw_iodim *batch_dims,
fftw_complex *in, double *out,
unsigned flags);
void CUFFTAPI fftw_execute(const fftw_plan plan);
void CUFFTAPI fftw_execute_dft(const fftw_plan plan,
fftw_complex *idata,
fftw_complex *odata);
void CUFFTAPI fftw_execute_dft_r2c(const fftw_plan plan,
double *idata,
fftw_complex *odata);
void CUFFTAPI fftw_execute_dft_c2r(const fftw_plan plan,
fftw_complex *idata,
double *odata);
// CUFFTW defines and supports the following single precision APIs
fftwf_plan CUFFTAPI fftwf_plan_dft_1d(int n,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_2d(int n0,
int n1,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_3d(int n0,
int n1,
int n2,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft(int rank,
const int *n,
fftwf_complex *in,
fftwf_complex *out,
int sign,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_1d(int n,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_2d(int n0,
int n1,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_3d(int n0,
int n1,
int n2,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_r2c(int rank,
const int *n,
float *in,
fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_1d(int n,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_2d(int n0,
int n1,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_3d(int n0,
int n1,
int n2,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_dft_c2r(int rank,
const int *n,
fftwf_complex *in,
float *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft(int rank,
const int *n,
int batch,
fftwf_complex *in,
const int *inembed, int istride, int idist,
fftwf_complex *out,
const int *onembed, int ostride, int odist,
int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft_r2c(int rank,
const int *n,
int batch,
float *in,
const int *inembed, int istride, int idist,
fftwf_complex *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_many_dft_c2r(int rank,
const int *n,
int batch,
fftwf_complex *in,
const int *inembed, int istride, int idist,
float *out,
const int *onembed, int ostride, int odist,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
fftwf_complex *in, fftwf_complex *out,
int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft_r2c(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
float *in, fftwf_complex *out,
unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru_dft_c2r(int rank, const fftwf_iodim *dims,
int batch_rank, const fftwf_iodim *batch_dims,
fftwf_complex *in, float *out,
unsigned flags);
void CUFFTAPI fftwf_execute(const fftw_plan plan);
void CUFFTAPI fftwf_execute_dft(const fftwf_plan plan,
fftwf_complex *idata,
fftwf_complex *odata);
void CUFFTAPI fftwf_execute_dft_r2c(const fftwf_plan plan,
float *idata,
fftwf_complex *odata);
void CUFFTAPI fftwf_execute_dft_c2r(const fftwf_plan plan,
fftwf_complex *idata,
float *odata);
/// CUFFTW 64-bit Guru Interface
/// double precision
fftw_plan CUFFTAPI fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, fftw_complex* out, int sign, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru64_dft_r2c(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, double* in, fftw_complex* out, unsigned flags);
fftw_plan CUFFTAPI fftw_plan_guru64_dft_c2r(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, double* out, unsigned flags);
/// single precision
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, fftwf_complex* out, int sign, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_r2c(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, float* in, fftwf_complex* out, unsigned flags);
fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_c2r(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, float* out, unsigned flags);
#ifdef _WIN32
#define _CUFFTAPI(T) T CUFFTAPI
#else
#define _CUFFTAPI(T) CUFFTAPI T
#endif
// CUFFTW defines and supports the following support APIs
_CUFFTAPI(void *) fftw_malloc(size_t n);
_CUFFTAPI(void *) fftwf_malloc(size_t n);
void CUFFTAPI fftw_free(void *pointer);
void CUFFTAPI fftwf_free(void *pointer);
void CUFFTAPI fftw_export_wisdom_to_file(FILE * output_file);
void CUFFTAPI fftwf_export_wisdom_to_file(FILE * output_file);
void CUFFTAPI fftw_import_wisdom_from_file(FILE * input_file);
void CUFFTAPI fftwf_import_wisdom_from_file(FILE * input_file);
void CUFFTAPI fftw_print_plan(const fftw_plan plan);
void CUFFTAPI fftwf_print_plan(const fftwf_plan plan);
void CUFFTAPI fftw_set_timelimit(double seconds);
void CUFFTAPI fftwf_set_timelimit(double seconds);
double CUFFTAPI fftw_cost(const fftw_plan plan);
double CUFFTAPI fftwf_cost(const fftw_plan plan);
void CUFFTAPI fftw_flops(const fftw_plan plan, double *add, double *mul, double *fma);
void CUFFTAPI fftwf_flops(const fftw_plan plan, double *add, double *mul, double *fma);
void CUFFTAPI fftw_destroy_plan(fftw_plan plan);
void CUFFTAPI fftwf_destroy_plan(fftwf_plan plan);
void CUFFTAPI fftw_cleanup(void);
void CUFFTAPI fftwf_cleanup(void);
#ifdef __cplusplus
}
#endif
#endif /* _CUFFTW_H_ */
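The FFTW-compatible surface above is meant to be used the same way FFTW is: allocate with fftwf_malloc, plan, execute, destroy. A minimal single-precision sketch, with a placeholder length and no error handling:

#include <cufftw.h>

int main() {
    const int N = 4096;  // transform length (placeholder)

    fftwf_complex *in  = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * N);
    fftwf_complex *out = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * N);
    // ... fill in[] with input samples ...

    fftwf_plan plan = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    fftwf_execute(plan);             // out[] now holds the forward transform

    fftwf_destroy_plan(plan);
    fftwf_free(in);
    fftwf_free(out);
    return 0;
}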

File diff suppressed because it is too large

View file

@@ -1,87 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURANDDISCRETE_H_)
#define CURANDDISCRETE_H_
struct curandDistributionShift_st {
curandDistribution_t probability;
curandDistribution_t host_probability;
unsigned int shift;
unsigned int length;
unsigned int host_gen;
};
struct curandHistogramM2_st {
curandHistogramM2V_t V;
curandHistogramM2V_t host_V;
curandHistogramM2K_t K;
curandHistogramM2K_t host_K;
unsigned int host_gen;
};
struct curandDistributionM2Shift_st {
curandHistogramM2_t histogram;
curandHistogramM2_t host_histogram;
unsigned int shift;
unsigned int length;
unsigned int host_gen;
};
struct curandDiscreteDistribution_st {
curandDiscreteDistribution_t self_host_ptr;
curandDistributionM2Shift_t M2;
curandDistributionM2Shift_t host_M2;
double stddev;
double mean;
curandMethod_t method;
unsigned int host_gen;
};
#endif // !defined(CURANDDISCRETE_H_)

View file

@@ -1,253 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_DISCRETE_H_)
#define CURAND_DISCRETE_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
template <typename T>
QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return _curand_M2_double(x, discrete_distribution->M2);
}
return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
}
template <typename STATE>
QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return curand_M2_double(state, discrete_distribution->M2);
}
return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
if (discrete_distribution->method == CURAND_M2){
return curand_M2_double4(state, discrete_distribution->M2);
}
double4 _res;
uint4 result;
_res = curand_normal4_double(state);
result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
return result;
}
/*
* \brief Return a discrete distributed unsigned int from a XORWOW generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
*
 * Return four discrete distributed unsigned ints derived from a
* distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete4(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
*
 * Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a MTGP32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Sobol32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a Sobol64 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
/*
* \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
*
* Return a single discrete distributed unsigned int derived from a
* distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param discrete_distribution - ancillary structure for discrete distribution
*
* \return unsigned int distributed by distribution defined by \p discrete_distribution.
*/
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
return curand__discrete(state, discrete_distribution);
}
#endif // !defined(CURAND_DISCRETE_H_)
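To see where these overloads are called from, here is a hedged sketch in which each thread draws one value from a host-built discrete distribution using XORWOW state. It assumes the ancillary structure was created with curandCreatePoissonDistribution() from the cuRAND host API; the seed, lambda, and launch shape are placeholders.

#include <curand.h>          // host API: curandCreatePoissonDistribution
#include <curand_kernel.h>   // device API: curand_init, curand_discrete

__global__ void draw_discrete(unsigned int *out, unsigned long long seed,
                              curandDiscreteDistribution_t dist)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);        // one subsequence per thread
    out[tid] = curand_discrete(&state, dist); // uses the ancillary structure built on the host
}

// Host-side setup (placeholders for lambda and launch shape):
//   curandDiscreteDistribution_t dist;
//   curandCreatePoissonDistribution(100.0, &dist);
//   draw_discrete<<<blocks, threads>>>(d_out, 1234ULL, dist);
//   curandDestroyDistribution(dist);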

View file

@@ -1,93 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_GLOBALS_H
#define CURAND_GLOBALS_H
#define MAX_XOR_N (5)
#define SKIPAHEAD_BLOCKSIZE (4)
#define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
#define CURAND_2POW32 (4294967296.f)
#define CURAND_2POW32_DOUBLE (4294967296.)
#define CURAND_2POW32_INV (2.3283064e-10f)
#define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
#define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
#define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
#define CURAND_2PI (6.2831855f)
#define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
#define CURAND_PI_DOUBLE (3.1415926535897932)
#define CURAND_2PI_DOUBLE (6.2831853071795860)
#define CURAND_SQRT2 (-1.4142135f)
#define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
#define SOBOL64_ITR_BINARY_DIVIDE 2
#define SOBOL_M2_BINARY_DIVIDE 10
#define MTGP32_M2_BINARY_DIVIDE 32
#define MAX_LAMBDA 400000
#define MIN_GAUSS_LAMBDA 2000
struct normal_args_st {
float mean;
float stddev;
};
typedef struct normal_args_st normal_args_t;
struct normal_args_double_st {
double mean;
double stddev;
};
typedef struct normal_args_double_st normal_args_double_t;
#endif

File diff suppressed because it is too large

View file

@@ -1,697 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_LOGNORMAL_H_)
#define CURAND_LOGNORMAL_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
/**
* \brief Return a log-normally distributed float from an XORWOW generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return a log-normally distributed float from a Philox4_32_10 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return two log-normally distributed floats from an XORWOW generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
{
float2 v = curand_box_muller(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return two log-normally distributed floats from a Philox4_32_10 generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
float2 v = curand_box_muller(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return four log-normally distributed floats from a Philox4_32_10 generator.
*
* Return four log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float4 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
{
float4 v = curand_box_muller4(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
v.z = expf(mean + (stddev * v.z));
v.w = expf(mean + (stddev * v.w));
return v;
}
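/*
 * Usage sketch (illustrative only, not part of the declarations in this file):
 * a kernel that draws pairs of log-normal floats with the XORWOW overloads
 * above. curand_log_normal2() is the cheaper path, since one Box-Muller
 * transform yields both values. Seed, mean, and stddev below are placeholders.
 *
 *   __global__ void draw_lognormal(float *out, unsigned long long seed)
 *   {
 *       int tid = blockIdx.x * blockDim.x + threadIdx.x;
 *       curandStateXORWOW_t state;
 *       curand_init(seed, tid, 0, &state);
 *       float2 v = curand_log_normal2(&state, 0.0f, 1.0f);
 *       out[2 * tid]     = v.x;
 *       out[2 * tid + 1] = v.y;
 *   }
 */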
/**
* \brief Return a log-normally distributed float from an MRG32k3a generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
{
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
float2 v = curand_box_muller_mrg(state);
state->boxmuller_extra = expf(mean + (stddev * v.y));
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
return expf(mean + (stddev * v.x));
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return two log-normally distributed floats from an MRG32k3a generator.
*
* Return two log-normally distributed floats derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then transforms them to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
{
float2 v = curand_box_muller_mrg(state);
v.x = expf(mean + (stddev * v.x));
v.y = expf(mean + (stddev * v.y));
return v;
}
/**
* \brief Return a log-normally distributed float from an MTGP32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a Sobol32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate a normally distributed result, then transforms the result
* to log-normal.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a Sobol64 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, then converts to log-normal
* distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
*
* Return a single log-normally distributed float derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, then converts to log-normal
* distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
{
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
}
/**
* \brief Return a log-normally distributed double from an XORWOW generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
unsigned int x0, x1, y0, y1;
x0 = curand(state);
x1 = curand(state);
y0 = curand(state);
y1 = curand(state);
double2 v = _curand_box_muller_double(x0, x1, y0, y1);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
 * \brief Return a log-normally distributed double from a Philox4_32_10 generator.
 *
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
uint4 _x;
_x = curand4(state);
double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two log-normally distributed doubles from an XORWOW generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
{
double2 v = curand_box_muller_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
/**
 * \brief Return two log-normally distributed doubles from a Philox4_32_10 generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
 * \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
double2 v = curand_box_muller2_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
// not part of the API
QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
double4 v = curand_box_muller4_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
v.z = exp(mean + (stddev * v.z));
v.w = exp(mean + (stddev * v.w));
return v;
}
/**
* \brief Return a log-normally distributed double from an MRG32k3a generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, transforms them to log-normal distribution,
* then returns them one at a time.
* See ::curand_log_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
double2 v = curand_box_muller_mrg_double(state);
state->boxmuller_extra_double = exp(mean + (stddev * v.y));
state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
return exp(mean + (stddev * v.x));
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two log-normally distributed doubles from an MRG32k3a generator.
*
* Return two log-normally distributed doubles derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then transforms them to a log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double2 where each element is from a
* distribution with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
double2 v = curand_box_muller_mrg_double(state);
v.x = exp(mean + (stddev * v.x));
v.y = exp(mean + (stddev * v.y));
return v;
}
/**
* \brief Return a log-normally distributed double from an MTGP32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a Sobol32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
*
* Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results, and transforms them into
* log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a Sobol64 generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the Sobol64 generator in \p state,
* increment position of generator by one.
*
 * The implementation uses the inverse cumulative distribution function
 * to generate normally distributed results, and transforms them into
 * log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
/**
* \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
*
 * Return a single log-normally distributed double derived from a normal
* distribution with mean \p mean and standard deviation \p stddev
* from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
 * The implementation uses the inverse cumulative distribution function
 * to generate normally distributed results, and transforms them into
 * log-normal distribution.
*
* \param state - Pointer to state to update
* \param mean - Mean of the related normal distribution
* \param stddev - Standard deviation of the related normal distribution
*
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
*/
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
{
return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
}
#endif // !defined(CURAND_LOGNORMAL_H_)

File diff suppressed because it is too large

View file

@@ -1,210 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_MTGP32_H
#define CURAND_MTGP32_H
/*
* @file curand_mtgp32.h
*
* @brief Mersenne Twister for Graphic Processors (mtgp32), which
* generates 32-bit unsigned integers and single precision floating
* point numbers based on IEEE 754 format.
*
* @author Mutsuo Saito (Hiroshima University)
* @author Makoto Matsumoto (Hiroshima University)
*
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define MTGPDC_MEXP 11213
#define MTGPDC_N 351
#define MTGPDC_FLOOR_2P 256
#define MTGPDC_CEIL_2P 512
#define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
#define MTGP32_STATE_SIZE 1024
#define MTGP32_STATE_MASK 1023
#define CURAND_NUM_MTGP32_PARAMS 200
#define MEXP 11213
#define THREAD_NUM MTGPDC_FLOOR_2P
#define LARGE_SIZE (THREAD_NUM * 3)
#define TBL_SIZE 16
/**
* \addtogroup DEVICE Device API
*
* @{
*/
/*
* \struct MTGP32_PARAMS_FAST_T
* MTGP32 parameters.
 * Some elements are redundant to keep the structure simple.
*
 * \b pos is a pick-up position selected to give good performance on
 * graphics processors. 3 < \b pos < Q, where Q is the maximum number
 * such that the size of the status array minus Q is a power of 2.
 * For example, when \b mexp is 44497, the size of the 32-bit status array
 * is 696, and Q is 184, so \b pos is between 4 and 183. This means
 * 512 parallel calculations are allowed when \b mexp is 44497.
*
 * \b poly_sha1 is the SHA1 digest of the characteristic polynomial of
 * the state transition function. SHA1 is calculated from the printed
 * form of the polynomial. This is important when we use parameters
 * generated by the dynamic creator.
*
 * \b mask is a mask that makes the dimension of the state space exactly
 * a Mersenne prime. This is redundant.
*/
struct mtgp32_params_fast;
struct mtgp32_params_fast {
int mexp; /*< Mersenne exponent. This is redundant. */
int pos; /*< pick up position. */
int sh1; /*< shift value 1. 0 < sh1 < 32. */
int sh2; /*< shift value 2. 0 < sh2 < 32. */
unsigned int tbl[16]; /*< a small matrix. */
unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */
unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
converting to float. */
unsigned int mask; /*< This is a mask for state space */
unsigned char poly_sha1[21]; /*< SHA1 digest */
};
/** \cond UNHIDE_TYPEDEFS */
typedef struct mtgp32_params_fast mtgp32_params_fast_t;
/** \endcond */
/*
* Generator Parameters.
*/
struct mtgp32_kernel_params;
struct mtgp32_kernel_params {
unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];
unsigned int mask[1];
};
/** \cond UNHIDE_TYPEDEFS */
typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
/** \endcond */
/*
* kernel I/O
* This structure must be initialized before first use.
*/
/* MTGP (Mersenne Twister) RNG */
/* This generator uses the Mersenne Twister algorithm of
* http://arxiv.org/abs/1005.4973v2
 * It has period 2^11213 - 1.
*/
/**
* CURAND MTGP32 state
*/
struct curandStateMtgp32;
struct curandStateMtgp32 {
unsigned int s[MTGP32_STATE_SIZE];
int offset;
int pIdx;
mtgp32_kernel_params_t * k;
};
/*
* CURAND MTGP32 state
*/
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateMtgp32 curandStateMtgp32_t;
/** \endcond */
/** @} */
#endif

View file

@@ -1,516 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* curand_mtgp32_host.h
*
*
* MTGP32-11213
*
* Mersenne Twister RNG for the GPU
*
* The period of generated integers is 2<sup>11213</sup>-1.
*
* This code generates 32-bit unsigned integers, and
* single precision floating point numbers uniformly distributed
* in the range [1, 2). (float r; 1.0 <= r < 2.0)
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined CURAND_MTGP32_HOST_H
#define CURAND_MTGP32_HOST_H
#if !defined(QUALIFIERS)
#define QUALIFIERS static inline __device__
#endif
#include <cuda.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include "curand.h"
#include "curand_mtgp32.h"
#include "curand_mtgp32dc_p_11213.h"
/**
* \addtogroup DEVICE Device API
*
* @{
*/
static const unsigned int non_zero = 0x4d544750;
/*
 * This is a helper function used in the initialization
 * by mtgp32_init_by_array() and mtgp32_init_by_str().
* @param[in] x 32-bit integer
* @return 32-bit integer
*/
static __forceinline__ unsigned int ini_func1(unsigned int x) {
return (x ^ (x >> 27)) * (1664525);
}
/*
 * This is a helper function used in the initialization
 * by mtgp32_init_by_array() and mtgp32_init_by_str().
* @param[in] x 32-bit integer
* @return 32-bit integer
*/
static __forceinline__ unsigned int ini_func2(unsigned int x) {
return (x ^ (x >> 27)) * (1566083941);
}
/*
* This function initializes the internal state array with a 32-bit
* integer seed. The allocated memory should be freed by calling
* mtgp32_free(). \b para should be one of the elements in the
* parameter table (mtgp32-param-ref.c).
*
 * This function is called by the CUDA program, because the CUDA program
 * uses a different structure and a different allocation method.
*
 * @param[out] state MTGP internal status vector.
* @param[in] para parameter structure
* @param[in] seed a 32-bit integer used as the seed.
*/
static __forceinline__ __host__
void mtgp32_init_state(unsigned int state[],
const mtgp32_params_fast_t *para, unsigned int seed) {
int i;
int size = para->mexp / 32 + 1;
unsigned int hidden_seed;
unsigned int tmp;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = seed;
state[1] = hidden_seed;
for (i = 1; i < size; i++) {
state[i] ^= (1812433253) * (state[i - 1] ^ (state[i - 1] >> 30)) + i;
}
}
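/*
 * Illustrative sketch, not part of the original header: host-side seeding of a
 * single raw state vector with parameter set 0 of the 11213 table included
 * above (curand_mtgp32dc_p_11213.h). The array size follows mexp/32 + 1, i.e.
 * MTGPDC_N words for MEXP 11213; the function name below is an assumption.
 */
static __forceinline__ __host__
void example_seed_one_state(unsigned int seed)
{
    unsigned int state[MTGPDC_N + 1];   /* >= 11213/32 + 1 = 351 words */
    mtgp32_init_state(state, &mtgp32dc_params_fast_11213[0], seed);
    /* state[] now holds one initialized MTGP32 status vector. */
}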
/*
* This function initializes the internal state array
* with a 32-bit integer array. \b para should be one of the elements in
* the parameter table (mtgp32-param-ref.c).
*
 * @param[out] state MTGP internal state array.
* @param[in] para parameter structure
* @param[in] array a 32-bit integer array used as a seed.
* @param[in] length length of the array.
* @return CURAND_STATUS_SUCCESS
*/
static __forceinline__ __host__
int mtgp32_init_by_array(unsigned int state[],
const mtgp32_params_fast_t *para,
unsigned int *array, int length) {
int i, j, count;
unsigned int r;
int lag;
int mid;
int size = para->mexp / 32 + 1;
unsigned int hidden_seed;
unsigned int tmp;
if (size >= 623) {
lag = 11;
} else if (size >= 68) {
lag = 7;
} else if (size >= 39) {
lag = 5;
} else {
lag = 3;
}
mid = (size - lag) / 2;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = hidden_seed;
if (length + 1 > size) {
count = length + 1;
} else {
count = size;
}
r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
state[mid] += r;
r += length;
state[(mid + lag) % size] += r;
state[0] = r;
i = 1;
count--;
for (i = 1, j = 0; (j < count) && (j < length); j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += array[j] + i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (; j < count; j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (j = 0; j < size; j++) {
r = ini_func2(state[i] + state[(i + mid) % size]
+ state[(i + size - 1) % size]);
state[(i + mid) % size] ^= r;
r -= i;
state[(i + mid + lag) % size] ^= r;
state[i] = r;
i = (i + 1) % size;
}
if (state[size - 1] == 0) {
state[size - 1] = non_zero;
}
return 0;
}
/*
* This function initializes the internal state array
* with a character array. \b para should be one of the elements in
* the parameter table (mtgp32-param-ref.c).
 * This uses the same algorithm as mtgp32_init_by_array(), but may be
 * more convenient.
*
 * @param[out] state MTGP internal state array.
* @param[in] para parameter structure
 * @param[in] array a character array used as a seed (zero-terminated).
 * @return memory allocation result; 0 means success.
*/
static __forceinline__ __host__
int mtgp32_init_by_str(unsigned int state[],
const mtgp32_params_fast_t *para, unsigned char *array) {
int i, j, count;
unsigned int r;
int lag;
int mid;
int size = para->mexp / 32 + 1;
int length = (unsigned int)strlen((char *)array);
unsigned int hidden_seed;
unsigned int tmp;
if (size >= 623) {
lag = 11;
} else if (size >= 68) {
lag = 7;
} else if (size >= 39) {
lag = 5;
} else {
lag = 3;
}
mid = (size - lag) / 2;
hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
tmp = hidden_seed;
tmp += tmp >> 16;
tmp += tmp >> 8;
memset(state, tmp & 0xff, sizeof(unsigned int) * size);
state[0] = hidden_seed;
if (length + 1 > size) {
count = length + 1;
} else {
count = size;
}
r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
state[mid] += r;
r += length;
state[(mid + lag) % size] += r;
state[0] = r;
i = 1;
count--;
for (i = 1, j = 0; (j < count) && (j < length); j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += array[j] + i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (; j < count; j++) {
r = ini_func1(state[i] ^ state[(i + mid) % size]
^ state[(i + size - 1) % size]);
state[(i + mid) % size] += r;
r += i;
state[(i + mid + lag) % size] += r;
state[i] = r;
i = (i + 1) % size;
}
for (j = 0; j < size; j++) {
r = ini_func2(state[i] + state[(i + mid) % size]
+ state[(i + size - 1) % size]);
state[(i + mid) % size] ^= r;
r -= i;
state[(i + mid + lag) % size] ^= r;
state[i] = r;
i = (i + 1) % size;
}
if (state[size - 1] == 0) {
state[size - 1] = non_zero;
}
return 0;
}
template<typename ParamsType>
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num)
{
const int size1 = sizeof(unsigned int) * block_num;
const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE;
unsigned int *h_pos_tbl;
unsigned int *h_sh1_tbl;
unsigned int *h_sh2_tbl;
unsigned int *h_param_tbl;
unsigned int *h_temper_tbl;
unsigned int *h_single_temper_tbl;
unsigned int *h_mask;
curandStatus_t status = CURAND_STATUS_SUCCESS;
h_pos_tbl = (unsigned int *)malloc(size1);
h_sh1_tbl = (unsigned int *)malloc(size1);
h_sh2_tbl = (unsigned int *)malloc(size1);
h_param_tbl = (unsigned int *)malloc(size2);
h_temper_tbl = (unsigned int *)malloc(size2);
h_single_temper_tbl = (unsigned int *)malloc(size2);
h_mask = (unsigned int *)malloc(sizeof(unsigned int));
if (h_pos_tbl == NULL
|| h_sh1_tbl == NULL
|| h_sh2_tbl == NULL
|| h_param_tbl == NULL
|| h_temper_tbl == NULL
|| h_single_temper_tbl == NULL
|| h_mask == NULL) {
if (h_pos_tbl != NULL) free(h_pos_tbl);
if (h_sh1_tbl != NULL) free(h_sh1_tbl);
if (h_sh2_tbl != NULL) free(h_sh2_tbl);
if (h_param_tbl != NULL) free(h_param_tbl);
if (h_temper_tbl != NULL) free(h_temper_tbl);
if (h_single_temper_tbl != NULL) free(h_single_temper_tbl);
if (h_mask != NULL) free(h_mask);
status = CURAND_STATUS_ALLOCATION_FAILED;
} else {
h_mask[0] = params[0].mask;
for (int i = 0; i < block_num; i++) {
h_pos_tbl[i] = params[i].pos;
h_sh1_tbl[i] = params[i].sh1;
h_sh2_tbl[i] = params[i].sh2;
for (int j = 0; j < TBL_SIZE; j++) {
h_param_tbl[i * TBL_SIZE + j] = params[i].tbl[j];
h_temper_tbl[i * TBL_SIZE + j] = params[i].tmp_tbl[j];
h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j];
}
}
if (cudaMemcpy( p->pos_tbl,
h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->sh1_tbl,
h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->sh2_tbl,
h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->param_tbl,
h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->temper_tbl,
h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->single_temper_tbl,
h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
} else
if (cudaMemcpy( p->mask,
h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess)
{
status = CURAND_STATUS_INITIALIZATION_FAILED;
}
}
if (h_pos_tbl != NULL) free(h_pos_tbl);
if (h_sh1_tbl != NULL) free(h_sh1_tbl);
if (h_sh2_tbl != NULL) free(h_sh2_tbl);
if (h_param_tbl != NULL) free(h_param_tbl);
if (h_temper_tbl != NULL) free(h_temper_tbl);
if (h_single_temper_tbl != NULL)free(h_single_temper_tbl);
if (h_mask != NULL) free(h_mask);
return status;
}
/**
* \brief Set up constant parameters for the mtgp32 generator
*
* This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of
* generator parameters for use by kernel functions and copies the
* result to the specified location in device memory.
*
* \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
* \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory.
*
* \return
* - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
* - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
* - CURAND_STATUS_SUCCESS otherwise
*/
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p)
{
return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS);
}
/**
* \brief Set up initial states for the mtgp32 generator
*
* This host-side helper function initializes a number of states (one parameter set per state) for
* an mtgp32 generator. To accomplish this it allocates a state array in host memory,
* initializes that array, and copies the result to device memory.
*
* \param s - pointer to an array of states in device memory
* \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
* \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory
* \param n - number of parameter sets/states to initialize
* \param seed - seed value
*
* \return
* - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated
* - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
* - CURAND_STATUS_SUCCESS otherwise
*/
static __forceinline__ __host__
curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s,
mtgp32_params_fast_t params[],
mtgp32_kernel_params_t *k,
int n,
unsigned long long seed)
{
int i;
curandStatus_t status = CURAND_STATUS_SUCCESS;
curandStateMtgp32_t *h_status =(curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n);
if (h_status == NULL) {
status = CURAND_STATUS_ALLOCATION_FAILED;
} else {
seed = seed ^ (seed >> 32);
for (i = 0; i < n; i++) {
mtgp32_init_state(&(h_status[i].s[0]), &params[i],(unsigned int)seed + i + 1);
h_status[i].offset = 0;
h_status[i].pIdx = i;
h_status[i].k = k;
}
if (cudaMemcpy(s, h_status,
sizeof(curandStateMtgp32_t) * n,
cudaMemcpyHostToDevice) != cudaSuccess) {
status = CURAND_STATUS_INITIALIZATION_FAILED;
}
}
free(h_status);
return status;
}
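/*
 * Illustrative sketch, not part of the original header: a typical host-side
 * setup sequence for the two helpers above, assuming 64 generator states
 * (one per thread block) and the 11213 parameter table included above.
 * The function name and the choice of 64 are assumptions; error handling is
 * reduced to early returns, and the CUDA runtime API is assumed available
 * because the file is compiled with nvcc.
 */
static __forceinline__ __host__
curandStatus_t example_setup_mtgp32(curandStateMtgp32_t **states,
                                    mtgp32_kernel_params_t **kparams,
                                    unsigned long long seed)
{
    const int n = 64;   /* number of states; must not exceed CURAND_NUM_MTGP32_PARAMS */
    if (cudaMalloc((void **)kparams, sizeof(mtgp32_kernel_params_t)) != cudaSuccess)
        return CURAND_STATUS_ALLOCATION_FAILED;
    if (cudaMalloc((void **)states, n * sizeof(curandStateMtgp32_t)) != cudaSuccess)
        return CURAND_STATUS_ALLOCATION_FAILED;
    curandStatus_t st = curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, *kparams);
    if (st != CURAND_STATUS_SUCCESS)
        return st;
    return curandMakeMTGP32KernelState(*states, mtgp32dc_params_fast_11213,
                                       *kparams, n, seed);
}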
/** @} */
#endif

View file

@@ -1,385 +0,0 @@
/*
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* curand_mtgp32_kernel.h
*
*
* MTGP32-11213
*
* Mersenne Twister RNG for the GPU
*
* The period of generated integers is 2<sup>11213</sup>-1.
*
* This code generates 32-bit unsigned integers, and
* single precision floating point numbers uniformly distributed
* in the range [1, 2). (float r; 1.0 <= r < 2.0)
*/
/*
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
* University. All rights reserved.
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
* University and University of Tokyo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of the Hiroshima University nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined CURAND_MTGP32_KERNEL_H
#define CURAND_MTGP32_KERNEL_H
#if !defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#endif
#ifndef __CUDACC_RTC__
#include <cuda.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#endif // ifndef __CUDACC_RTC__
#include "curand.h"
#include "curand_mtgp32.h"
/**
* \addtogroup DEVICE Device API
*
* @{
*/
#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host compatibility call
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif
/*
 * Computes the recursion formula of the generator.
*
* @param[in] X1 the farthest part of state array.
* @param[in] X2 the second farthest part of state array.
* @param[in] Y a part of state array.
* @param[in] bid block id.
* @return output
*/
QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
unsigned int X = (X1 & k->mask[0]) ^ X2;
unsigned int MAT;
X ^= X << k->sh1_tbl[bid];
Y = X ^ (Y >> k->sh2_tbl[bid]);
MAT = k->param_tbl[bid][Y & 0x0f];
return Y ^ MAT;
}
/*
* The tempering function.
*
 * @param[in] V the output value to be tempered.
* @param[in] T the tempering helper value.
* @param[in] bid block id.
* @return the tempered value.
*/
QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
unsigned int MAT;
T ^= T >> 16;
T ^= T >> 8;
MAT = k->temper_tbl[bid][T & 0x0f];
return V ^ MAT;
}
/*
* The tempering and converting function.
* By using the preset table, converting to IEEE format
* and tempering are done simultaneously.
*
 * @param[in] V the output value to be tempered.
* @param[in] T the tempering helper value.
* @param[in] bid block id.
* @return the tempered and converted value.
*/
QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
unsigned int MAT;
unsigned int r;
T ^= T >> 16;
T ^= T >> 8;
MAT = k->single_temper_tbl[bid][T & 0x0f];
r = (V >> 9) ^ MAT;
return r;
}
/**
* \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
*
* Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
* increment position of generator by the number of threads in the block.
 * Note that the number of threads in the block cannot exceed 256.
*
* \param state - Pointer to state to update
*
* \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
*/
QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
{
unsigned int t;
unsigned int d;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o;
d = blockDim.z * blockDim.y * blockDim.x;
//assert( d <= 256 );
t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
o = temper(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (t == 0)
{
state->offset = (state->offset + d) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
return o;
}
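/*
 * Illustrative sketch, not part of the original header: every thread of a
 * 256-thread block draws from the block's own MTGP32 state, matching the
 * per-block limit described above. The kernel name and output layout are
 * assumptions; the states would come from curandMakeMTGP32KernelState().
 */
__global__ void example_mtgp32_bits(curandStateMtgp32_t *states,
                                    unsigned int *out, int samples_per_block)
{
    /* All threads of the block call curand() on every iteration, so the
       __syncthreads() inside the generator stays uniform across the block. */
    for (int i = 0; i < samples_per_block; i += blockDim.x) {
        unsigned int r = curand(&states[blockIdx.x]);
        if (i + (int)threadIdx.x < samples_per_block)
            out[blockIdx.x * samples_per_block + i + threadIdx.x] = r;
    }
}
/* Launched as, e.g., example_mtgp32_bits<<<64, 256>>>(devStates, devOut, 4096); */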
/**
* \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
*
* Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
* increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block for this invocation.
*
 * Note:
 * Thread indices must range from 0...\p n - 1.
* The number of positions updated may not exceed 256.
* A thread block may update more than one state, but a given state may not be updated by more than one thread block.
*
* \param state - Pointer to state to update
* \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
*
* \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
*/
QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
unsigned int t;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o;
t = index;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
o = temper(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (index == 0)
{
state->offset = (state->offset + n) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
return o;
}
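/*
 * Illustrative sketch, not part of the original header: a block of exactly 64
 * threads draws from its state through the position-specific entry point
 * above, passing the thread index and the total count of updated positions.
 * The kernel name and the choice of 64 are assumptions for this example.
 */
__global__ void example_mtgp32_specific(curandStateMtgp32_t *states, unsigned int *out)
{
    const unsigned char n = 64;   /* positions updated per call; launch with 64 threads per block */
    unsigned int r = curand_mtgp32_specific(&states[blockIdx.x],
                                            (unsigned char)threadIdx.x, n);
    out[blockIdx.x * n + threadIdx.x] = r;
}
/* Launched as, e.g., example_mtgp32_specific<<<gridDim, 64>>>(devStates, devOut); */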
/**
* \brief Return a uniformly distributed float from a mtgp32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the mtgp32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* Note: This alternate derivation of a uniform float is provided for completeness
 * with the original source.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
{
unsigned int t;
unsigned int d;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o_u;
float o_f;
t = blockDim.z * blockDim.y;
d = t * blockDim.x;
//assert( d <= 256 );
t += threadIdx.x;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[t] = r;
o_u = temper_single(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (threadIdx.x == 0)
{
state->offset = (state->offset + d) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
memcpy(&o_f, &o_u, sizeof(o_u));
return o_f;
}
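/*
 * Illustrative sketch, not part of the original header: the generic IEEE-754
 * trick behind the "temper and convert" path used above. Placing 23 random
 * bits in the mantissa of a float whose exponent field encodes 2^0 yields a
 * value uniform in [1.0f, 2.0f); subtracting 1.0f maps it to [0.0f, 1.0f).
 * This is a simplified stand-in for the table-driven conversion, not the
 * cuRAND implementation itself; the function name is an assumption.
 */
QUALIFIERS float example_bits_to_unit_float(unsigned int r)
{
    unsigned int bits = 0x3f800000u | (r >> 9);   /* sign 0, exponent 127, random mantissa */
    float f;
    memcpy(&f, &bits, sizeof(f));                 /* f is uniform in [1.0f, 2.0f) */
    return f - 1.0f;                              /* shift to [0.0f, 1.0f) */
}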
/**
* \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from position \p index of the mtgp32 generator in \p state, and
* increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block for this invocation.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* Note 1:
* Thread indices must range from 0...\p n - 1.
* The number of positions updated may not exceed 256.
* A thread block may update more than one state, but a given state may not be updated by more than one thread block.
*
* Note 2: This alternate derivation of a uniform float is provided for completeness
 * with the original source.
*
* \param state - Pointer to state to update
* \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
unsigned int t;
int pos = state->k->pos_tbl[state->pIdx];
unsigned int r;
unsigned int o_u;
float o_f;
t = index;
r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
state->pIdx);
state->s[t] = r;
o_u = temper_single(state->k, r,
state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
state->pIdx);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (threadIdx.x == 0)
{
state->offset = (state->offset + n) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
memcpy(&o_f, &o_u, sizeof(o_u));
return o_f;
}
/** @} */
#endif

File diff suppressed because it is too large

View file

@@ -1,837 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_NORMAL_H_)
#define CURAND_NORMAL_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
#include "curand_normal_static.h"
QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
{
float2 result;
float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2);
float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2);
#if __CUDA_ARCH__ > 0
float s = sqrtf(-2.0f * logf(u));
__sincosf(v, &result.x, &result.y);
#else
float s = sqrtf(-2.0f * logf(u));
result.x = sinf(v);
result.y = cosf(v);
#endif
result.x *= s;
result.y *= s;
return result;
}
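/*
 * Illustrative sketch, not part of the original header: a plain host-side
 * Box-Muller step with the same structure as _curand_box_muller() above.
 * Two 32-bit draws are mapped into (0, 1) and (0, 2*pi), then combined into
 * two independent standard normals. The constants are written out explicitly
 * instead of using the cuRAND macros; the function name is an assumption.
 */
static inline void example_box_muller_host(unsigned int x, unsigned int y,
                                           float *n0, float *n1)
{
    const float inv32  = 2.3283064365386963e-10f;        /* 2^-32 */
    const float two_pi = 6.283185307179586f;
    float u = x * inv32 + inv32 * 0.5f;                  /* strictly inside (0, 1) */
    float v = y * (inv32 * two_pi) + (inv32 * two_pi) * 0.5f;
    float s = sqrtf(-2.0f * logf(u));
    *n0 = s * sinf(v);
    *n1 = s * cosf(v);
}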
QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
{
float x, y;
x = curand_uniform(state);
y = curand_uniform(state) * CURAND_2PI;
float2 result;
#if __CUDA_ARCH__ > 0
float s = sqrtf(-2.0f * logf(x));
__sincosf(y, &result.x, &result.y);
#else
float s = sqrtf(-2.0f * logf(x));
result.x = sinf(y);
result.y = cosf(y);
#endif
result.x *= s;
result.y *= s;
return result;
}
QUALIFIERS double2
_curand_box_muller_double(unsigned int x0, unsigned int x1,
unsigned int y0, unsigned int y1)
{
double2 result;
unsigned long long zx = (unsigned long long)x0 ^
((unsigned long long)x1 << (53 - 32));
double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
unsigned long long zy = (unsigned long long)y0 ^
((unsigned long long)y1 << (53 - 32));
double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE;
double s = sqrt(-2.0 * log(u));
#if __CUDA_ARCH__ > 0
sincospi(v, &result.x, &result.y);
#else
result.x = sin(v*CURAND_PI_DOUBLE);
result.y = cos(v*CURAND_PI_DOUBLE);
#endif
result.x *= s;
result.y *= s;
return result;
}
QUALIFIERS double2
curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
{
double x, y;
double2 result;
x = curand_uniform_double(state);
y = curand_uniform_double(state) * 2.0;
double s = sqrt(-2.0 * log(x));
#if __CUDA_ARCH__ > 0
sincospi(y, &result.x, &result.y);
#else
result.x = sin(y*CURAND_PI_DOUBLE);
result.y = cos(y*CURAND_PI_DOUBLE);
#endif
result.x *= s;
result.y *= s;
return result;
}
template <typename R>
QUALIFIERS float2 curand_box_muller(R *state)
{
float2 result;
unsigned int x = curand(state);
unsigned int y = curand(state);
result = _curand_box_muller(x, y);
return result;
}
template <typename R>
QUALIFIERS float4 curand_box_muller4(R *state)
{
float4 result;
float2 _result;
uint4 x = curand4(state);
//unsigned int y = curand(state);
_result = _curand_box_muller(x.x, x.y);
result.x = _result.x;
result.y = _result.y;
_result = _curand_box_muller(x.z, x.w);
result.z = _result.x;
result.w = _result.y;
return result;
}
template <typename R>
QUALIFIERS double2 curand_box_muller_double(R *state)
{
double2 result;
unsigned int x0 = curand(state);
unsigned int x1 = curand(state);
unsigned int y0 = curand(state);
unsigned int y1 = curand(state);
result = _curand_box_muller_double(x0, x1, y0, y1);
return result;
}
template <typename R>
QUALIFIERS double2 curand_box_muller2_double(R *state)
{
double2 result;
uint4 _x;
_x = curand4(state);
result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
return result;
}
template <typename R>
QUALIFIERS double4 curand_box_muller4_double(R *state)
{
double4 result;
double2 _res1;
double2 _res2;
uint4 _x;
uint4 _y;
_x = curand4(state);
_y = curand4(state);
_res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
_res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
result.x = _res1.x;
result.y = _res1.y;
result.z = _res2.x;
result.w = _res2.y;
return result;
}
//QUALIFIERS float _curand_normal_icdf(unsigned int x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// float s = CURAND_SQRT2;
// // Mirror to avoid loss of precision
// if(x > 0x80000000UL) {
// x = 0xffffffffUL - x;
// s = -s;
// }
// float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinvf(2.0f * p);
//#else
// x++; //suppress warnings
// return 0.0f;
//#endif
//}
//
//QUALIFIERS float _curand_normal_icdf(unsigned long long x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// unsigned int t = (unsigned int)(x >> 32);
// float s = CURAND_SQRT2;
// // Mirror to avoid loss of precision
// if(t > 0x80000000UL) {
// t = 0xffffffffUL - t;
// s = -s;
// }
// float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinvf(2.0f * p);
//#else
// x++;
// return 0.0f;
//#endif
//}
//
//QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// double s = CURAND_SQRT2_DOUBLE;
// // Mirror to avoid loss of precision
// if(x > 0x80000000UL) {
// x = 0xffffffffUL - x;
// s = -s;
// }
// double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinv(2.0 * p);
//#else
// x++;
// return 0.0;
//#endif
//}
//
//QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
//{
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
// double s = CURAND_SQRT2_DOUBLE;
// x >>= 11;
// // Mirror to avoid loss of precision
// if(x > 0x10000000000000UL) {
// x = 0x1fffffffffffffUL - x;
// s = -s;
// }
// double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
// // p is in (0, 0.5], 2p is in (0, 1]
// return s * erfcinv(2.0 * p);
//#else
// x++;
// return 0.0;
//#endif
//}
//
/**
* \brief Return a normally distributed float from an XORWOW generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the XORWOW generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
 * \brief Return a normally distributed float from a Philox4_32_10 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
unsigned int x, y;
x = curand(state);
y = curand(state);
float2 v = _curand_box_muller(x, y);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
* \brief Return a normally distributed float from an MRG32k3a generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
* increment position of generator by one.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
{
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
float2 v = curand_box_muller_mrg(state);
state->boxmuller_extra = v.y;
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag = 0;
return state->boxmuller_extra;
}
/**
* \brief Return two normally distributed floats from an XORWOW generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the XORWOW generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
{
return curand_box_muller(state);
}
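/*
 * Illustrative sketch, not part of the original header: when a thread needs
 * normals in pairs, one curand_normal2() call keeps both Box-Muller outputs
 * instead of caching the second one in the state as the single-value variant
 * does. The kernel name and output layout are assumptions, and the snippet is
 * meant for a .cu file that includes <curand_kernel.h>.
 */
__global__ void example_normal_pairs(float2 *out, unsigned long long seed, int n)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n)
        return;
    curandStateXORWOW_t state;
    curand_init(seed, id, 0, &state);   /* one sequence per thread */
    out[id] = curand_normal2(&state);   /* two N(0,1) samples from one call */
}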
/**
 * \brief Return two normally distributed floats from a Philox4_32_10 generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
{
return curand_box_muller(state);
}
/**
 * \brief Return four normally distributed floats from a Philox4_32_10 generator.
*
* Return four normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
* increment position of generator by four.
*
 * The implementation uses Box-Muller transforms to generate four
 * normally distributed results.
*
* \param state - Pointer to state to update
*
 * \return Normally distributed float4 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
{
return curand_box_muller4(state);
}
/**
* \brief Return two normally distributed floats from an MRG32k3a generator.
*
* Return two normally distributed floats with mean \p 0.0f and
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
* increment position of generator by two.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float2 where each element is from a
* distribution with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
{
return curand_box_muller_mrg(state);
}
/**
* \brief Return a normally distributed float from a MTGP32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a Sobol32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateSobol32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a scrambled Sobol32 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a Sobol64 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateSobol64_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed float from a scrambled Sobol64 generator.
*
* Return a single normally distributed float with mean \p 0.0f and
* standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
*/
QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
{
return _curand_normal_icdf(curand(state));
}
/**
* \brief Return a normally distributed double from an XORWOW generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the XORWOW generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
unsigned int x0, x1, y0, y1;
x0 = curand(state);
x1 = curand(state);
y0 = curand(state);
y1 = curand(state);
double2 v = _curand_box_muller_double(x0, x1, y0, y1);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
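/* Editor's illustrative sketch, not part of the original header: one way a
 * kernel might consume curand_normal_double() with per-thread XORWOW states.
 * The kernel name and launch layout are assumptions made for this example;
 * only curand_init() and curand_normal_double() are cuRAND API calls, and a
 * standalone .cu file would #include <curand_kernel.h>. Because the second
 * Box-Muller output is cached in the state, consecutive calls alternate
 * between computing a fresh pair and returning the cached companion. */
__global__ void example_fill_normal_double(double *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);              /* one state per thread */
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_normal_double(&state);
}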
/**
 * \brief Return a normally distributed double from a Philox4_32_10 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
uint4 _x;
_x = curand4(state);
double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return a normally distributed double from an MRG32k3a generator.
*
* Return a single normally distributed double with mean \p 0.0 and
 * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results, then returns them one at a time.
* See ::curand_normal2_double() for a more efficient version that returns
* both results at once.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
{
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
double2 v = curand_box_muller_mrg_double(state);
state->boxmuller_extra_double = v.y;
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
return v.x;
}
state->boxmuller_flag_double = 0;
return state->boxmuller_extra_double;
}
/**
* \brief Return two normally distributed doubles from an XORWOW generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the XORWOW generator in \p state,
* increment position of generator by 2.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
{
return curand_box_muller_double(state);
}
/**
 * \brief Return two normally distributed doubles from a Philox4_32_10 generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
* increment position of generator by 2.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
double2 result;
_x = curand4(state);
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
result.x = v1.x;
result.y = v1.y;
return result;
}
// not a part of API
QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
uint4 _y;
double4 result;
_x = curand4(state);
_y = curand4(state);
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
result.x = v1.x;
result.y = v1.y;
result.z = v2.x;
result.w = v2.y;
return result;
}
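/* Editor's illustrative sketch, not part of the original header: drawing
 * normally distributed doubles two at a time from a Philox4_32_10 state via
 * the documented curand_normal2_double() (the four-wide double variant above
 * is explicitly not part of the public API). The kernel name and indexing are
 * assumptions; a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_normal2_double(double *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = 2 * tid; i + 1 < n; i += 2 * gridDim.x * blockDim.x) {
        double2 v = curand_normal2_double(&state);  /* one call -> two doubles */
        out[i]     = v.x;
        out[i + 1] = v.y;
    }
}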
/**
* \brief Return two normally distributed doubles from an MRG32k3a generator.
*
* Return two normally distributed doubles with mean \p 0.0 and
* standard deviation \p 1.0 from the MRG32k3a generator in \p state,
* increment position of generator.
*
* The implementation uses a Box-Muller transform to generate two
* normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double2 where each element is from a
* distribution with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
{
return curand_box_muller_mrg_double(state);
}
/**
* \brief Return a normally distributed double from an MTGP32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the MTGP32 generator in \p state,
* increment position of generator.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
 * \brief Return a normally distributed double from a Sobol32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a scrambled Sobol32 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a Sobol64 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
/**
* \brief Return a normally distributed double from a scrambled Sobol64 generator.
*
* Return a single normally distributed double with mean \p 0.0 and
* standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* The implementation uses the inverse cumulative distribution function
* to generate normally distributed results.
*
* \param state - Pointer to state to update
*
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
*/
QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
{
return _curand_normal_icdf_double(curand(state));
}
#endif // !defined(CURAND_NORMAL_H_)

View file

@@ -1,127 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef CURAND_NORMAL_STATIC_H
#define CURAND_NORMAL_STATIC_H
#define QUALIFIERS_STATIC __host__ __device__ __forceinline__
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
float s = CURAND_SQRT2;
// Mirror to avoid loss of precision
if(x > 0x80000000UL) {
x = 0xffffffffUL - x;
s = -s;
}
float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinvf(2.0f * p);
#else
x++; //suppress warnings
return 0.0f;
#endif
}
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
unsigned int t = (unsigned int)(x >> 32);
float s = CURAND_SQRT2;
// Mirror to avoid loss of precision
if(t > 0x80000000UL) {
t = 0xffffffffUL - t;
s = -s;
}
float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinvf(2.0f * p);
#else
x++;
return 0.0f;
#endif
}
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
double s = CURAND_SQRT2_DOUBLE;
// Mirror to avoid loss of precision
if(x > 0x80000000UL) {
x = 0xffffffffUL - x;
s = -s;
}
double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinv(2.0 * p);
#else
x++;
return 0.0;
#endif
}
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
double s = CURAND_SQRT2_DOUBLE;
x >>= 11;
// Mirror to avoid loss of precision
if(x > 0x10000000000000UL) {
x = 0x1fffffffffffffUL - x;
s = -s;
}
double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
// p is in (0, 0.5], 2p is in (0, 1]
return s * erfcinv(2.0 * p);
#else
x++;
return 0.0;
#endif
}
#undef QUALIFIERS_STATIC
#endif
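/* Editor's illustrative restatement, not part of the original header: the same
 * inverse-CDF mapping written out for the 32-bit case. The function name and
 * the literal constants (sqrt(2), 1/2^32 and its half) are assumptions standing
 * in for CURAND_SQRT2 and CURAND_2POW32_INV; erfcinvf() is the CUDA device math
 * function used above. */
__device__ float example_normal_from_u32(unsigned int x)
{
    float s = 1.41421356237309505f;        /* sqrt(2) */
    if (x > 0x80000000U) {                 /* mirror the upper half of the range */
        x = 0xffffffffU - x;
        s = -s;
    }
    /* scale into p in (0, 0.5]; the half-step keeps p strictly positive */
    float p = x * 2.3283064365386963e-10f + 1.1641532182693481e-10f;
    /* erfcinvf(2p) gives a positive half-normal deviate; s restores the sign */
    return s * erfcinvf(2.0f * p);
}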

View file

@@ -1,194 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CURAND_PHILOX4X32_X__H_
#define CURAND_PHILOX4X32_X__H_
#if !defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
struct curandStatePhilox4_32_10 {
uint4 ctr;
uint4 output;
uint2 key;
unsigned int STATE;
int boxmuller_flag;
int boxmuller_flag_double;
float boxmuller_extra;
double boxmuller_extra_double;
};
typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
{
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n>>32);
s->ctr.x += nlo;
if( s->ctr.x < nlo )
nhi++;
s->ctr.y += nhi;
if(nhi <= s->ctr.y)
return;
if(++s->ctr.z) return;
++s->ctr.w;
}
QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
{
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n>>32);
s->ctr.z += nlo;
if( s->ctr.z < nlo )
nhi++;
s->ctr.w += nhi;
}
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
{
if(++s->ctr.x) return;
if(++s->ctr.y) return;
if(++s->ctr.z) return;
++s->ctr.w;
}
QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
{
#ifndef __CUDA_ARCH__
// host code
unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
*hip = product >> 32;
return (unsigned int)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
{
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
return ret;
}
QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
{
c = _philox4x32round(c, k); // 1
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 2
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 3
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 4
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 5
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 6
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 7
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 8
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
c = _philox4x32round(c, k); // 9
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
return _philox4x32round(c, k); // 10
}
#endif
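/* Editor's illustrative sketch, not part of the original header: the raw
 * generator above is normally reached through the public device API rather
 * than by calling curand_Philox4x32_10() directly. The kernel name and
 * indexing are assumptions; curand_init() and curand4() are the cuRAND entry
 * points, and a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_philox_bits(unsigned int *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    /* the seed feeds the key; subsequence/offset position the 128-bit counter */
    curand_init(seed, tid, 0, &state);
    for (int i = 4 * tid; i + 3 < n; i += 4 * gridDim.x * blockDim.x) {
        uint4 r = curand4(&state);          /* four 32-bit outputs per call */
        out[i]     = r.x;
        out[i + 1] = r.y;
        out[i + 2] = r.z;
        out[i + 3] = r.w;
    }
}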

View file

@@ -1,751 +0,0 @@
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_POISSON_H_)
#define CURAND_POISSON_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
#define CR_CUDART_PI 3.1415926535897931e+0
#define CR_CUDART_TWO_TO_52 4503599627370496.0
QUALIFIERS float __cr_rsqrt(float a)
{
#ifdef __CUDA_ARCH__
asm ("rsqrt.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = 1.0f / sqrtf (a);
#endif
return a;
}
QUALIFIERS float __cr_exp (float a)
{
#ifdef __CUDA_ARCH__
a = a * 1.4426950408889634074;
asm ("ex2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = expf (a);
#endif
return a;
}
QUALIFIERS float __cr_log (float a)
{
#ifdef __CUDA_ARCH__
asm ("lg2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
a = a * 0.69314718055994530942;
#else
a = logf (a);
#endif
return a;
}
QUALIFIERS float __cr_rcp (float a)
{
#ifdef __CUDA_ARCH__
asm ("rcp.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
a = 1.0f / a;
#endif
return a;
}
/* Computes regularized gamma function: gammainc(a,x)/gamma(a) */
QUALIFIERS float __cr_pgammainc (float a, float x)
{
float t, alpha, beta;
/* First level parametrization constants */
float ma1 = 1.43248035075540910f,
ma2 = 0.12400979329415655f,
ma3 = 0.00025361074907033f,
mb1 = 0.21096734870196546f,
mb2 = 1.97381164089999420f,
mb3 = 0.94201734077887530f;
/* Second level parametrization constants (depends only on a) */
alpha = __cr_rsqrt (a - ma2);
alpha = ma1 * alpha + ma3;
beta = __cr_rsqrt (a - mb2);
beta = mb1 * beta + mb3;
/* Final approximation (depends on a and x) */
t = a - x;
t = alpha * t - beta;
t = 1.0f + __cr_exp (t);
t = t * t;
t = __cr_rcp (t);
/* Negative a,x or a,x=NAN requires special handling */
//t = !(x > 0 && a >= 0) ? 0.0 : t;
return t;
}
/* Computes inverse of pgammainc */
QUALIFIERS float __cr_pgammaincinv (float a, float y)
{
float t, alpha, beta;
/* First level parametrization constants */
float ma1 = 1.43248035075540910f,
ma2 = 0.12400979329415655f,
ma3 = 0.00025361074907033f,
mb1 = 0.21096734870196546f,
mb2 = 1.97381164089999420f,
mb3 = 0.94201734077887530f;
/* Second level parametrization constants (depends only on a) */
alpha = __cr_rsqrt (a - ma2);
alpha = ma1 * alpha + ma3;
beta = __cr_rsqrt (a - mb2);
beta = mb1 * beta + mb3;
/* Final approximation (depends on a and y) */
t = __cr_rsqrt (y) - 1.0f;
t = __cr_log (t);
t = beta + t;
t = - t * __cr_rcp (alpha) + a;
/* Negative a,x or a,x=NAN requires special handling */
//t = !(y > 0 && a >= 0) ? 0.0 : t;
return t;
}
#if defined(__CUDACC_RDC__) && (__cplusplus >= 201703L) && defined(__cpp_inline_variables)
inline __constant__ double __cr_lgamma_table [] = {
#else
static __constant__ double __cr_lgamma_table [] = {
#endif
0.000000000000000000e-1,
0.000000000000000000e-1,
6.931471805599453094e-1,
1.791759469228055001e0,
3.178053830347945620e0,
4.787491742782045994e0,
6.579251212010100995e0,
8.525161361065414300e0,
1.060460290274525023e1
};
QUALIFIERS double __cr_lgamma_integer(int a)
{
double s;
double t;
double fa = fabs((float)a);
double sum;
if (a > 8) {
/* Stirling approximation; coefficients from Hart et al, "Computer
* Approximations", Wiley 1968. Approximation 5404.
*/
s = 1.0 / fa;
t = s * s;
sum = -0.1633436431e-2;
sum = sum * t + 0.83645878922e-3;
sum = sum * t - 0.5951896861197e-3;
sum = sum * t + 0.793650576493454e-3;
sum = sum * t - 0.277777777735865004e-2;
sum = sum * t + 0.833333333333331018375e-1;
sum = sum * s + 0.918938533204672;
s = 0.5 * log (fa);
t = fa - 0.5;
s = s * t;
t = s - fa;
s = s + sum;
t = t + s;
return t;
} else {
#ifdef __CUDA_ARCH__
return __cr_lgamma_table [(int) fa-1];
#else
switch(a) {
case 1: return 0.000000000000000000e-1;
case 2: return 0.000000000000000000e-1;
case 3: return 6.931471805599453094e-1;
case 4: return 1.791759469228055001e0;
case 5: return 3.178053830347945620e0;
case 6: return 4.787491742782045994e0;
case 7: return 6.579251212010100995e0;
case 8: return 8.525161361065414300e0;
default: return 1.060460290274525023e1;
}
#endif
}
}
#define KNUTH_FLOAT_CONST 60.0
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
QUALIFIERS unsigned int curand_poisson_knuth(T *state, float lambda)
{
unsigned int k = 0;
float p = expf(lambda);
do{
k++;
p *= curand_uniform(state);
}while (p > 1.0);
return k-1;
}
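/* Editor's illustrative host-side restatement, not part of the original header:
 * the same Knuth loop using the C standard library in place of curand_uniform().
 * The function name is an assumption; like the device version, it is only
 * practical for small lambda because p starts at exp(lambda). */
static unsigned int example_poisson_knuth_host(double lambda)
{
    unsigned int k = 0;
    double p = exp(lambda);                 /* needs <math.h> and <stdlib.h> */
    do {
        k++;
        /* multiply uniforms in (0,1) into p until the product drops to 1 */
        p *= (rand() + 1.0) / ((double)RAND_MAX + 2.0);
    } while (p > 1.0);
    return k - 1;
}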
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
QUALIFIERS uint4 curand_poisson_knuth4(T *state, float lambda)
{
uint4 k = {0,0,0,0};
float exp_lambda = expf(lambda);
float4 p={ exp_lambda,exp_lambda,exp_lambda,exp_lambda };
do{
k.x++;
p.x *= curand_uniform(state);
}while (p.x > 1.0);
do{
k.y++;
p.y *= curand_uniform(state);
}while (p.y > 1.0);
do{
k.z++;
p.z *= curand_uniform(state);
}while (p.z > 1.0);
do{
k.w++;
p.w *= curand_uniform(state);
}while (p.w > 1.0);
k.x--;
k.y--;
k.z--;
k.w--;
return k;
}
template <typename T>
// Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
QUALIFIERS unsigned int _curand_M2_double(T x, curandDistributionM2Shift_t distributionM2)
{
double u = _curand_uniform_double(x);
int j = (int) floor(distributionM2->length*u);
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double histogramVj = __ldg( &(distributionM2->histogram->V[j]));
unsigned int histogramKj = __ldg( &(distributionM2->histogram->K[j]));
#else
double histogramVj = distributionM2->histogram->V[j];
unsigned int histogramKj = distributionM2->histogram->K[j];
#endif
//if (u < distributionM2->histogram->V[j]) return distributionM2->shift + j;
//return distributionM2->shift + distributionM2->histogram->K[j];
if (u < histogramVj) return distributionM2->shift + j;
return distributionM2->shift + histogramKj;
}
template <typename T>
// Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
QUALIFIERS uint4 _curand_M2_double4(T x, curandDistributionM2Shift_t distributionM2)
{
double4 u;
uint4 result = {0,0,0,0};
int4 flag = {1,1,1,1};
u.x = _curand_uniform_double(x.x);
u.y = _curand_uniform_double(x.y);
u.z = _curand_uniform_double(x.z);
u.w = _curand_uniform_double(x.w);
int4 j;
j.x = (int) floor(distributionM2->length*u.x);
j.y = (int) floor(distributionM2->length*u.y);
j.z = (int) floor(distributionM2->length*u.z);
j.w = (int) floor(distributionM2->length*u.w);
// int result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double histogramVjx = __ldg( &(distributionM2->histogram->V[j.x]));
double histogramVjy = __ldg( &(distributionM2->histogram->V[j.y]));
double histogramVjz = __ldg( &(distributionM2->histogram->V[j.z]));
double histogramVjw = __ldg( &(distributionM2->histogram->V[j.w]));
unsigned int histogramKjx = __ldg( &(distributionM2->histogram->K[j.x]));
unsigned int histogramKjy = __ldg( &(distributionM2->histogram->K[j.y]));
unsigned int histogramKjz = __ldg( &(distributionM2->histogram->K[j.z]));
unsigned int histogramKjw = __ldg( &(distributionM2->histogram->K[j.w]));
#else
double histogramVjx = distributionM2->histogram->V[j.x];
double histogramVjy = distributionM2->histogram->V[j.y];
double histogramVjz = distributionM2->histogram->V[j.z];
double histogramVjw = distributionM2->histogram->V[j.w];
unsigned int histogramKjx = distributionM2->histogram->K[j.x];
unsigned int histogramKjy = distributionM2->histogram->K[j.y];
unsigned int histogramKjz = distributionM2->histogram->K[j.z];
unsigned int histogramKjw = distributionM2->histogram->K[j.w];
#endif
if (u.x < histogramVjx){ result.x = distributionM2->shift + j.x; flag.x = 0; }
if (u.y < histogramVjy){ result.y = distributionM2->shift + j.y; flag.y = 0; }
if (u.z < histogramVjz){ result.z = distributionM2->shift + j.z; flag.z = 0; }
if (u.w < histogramVjw){ result.w = distributionM2->shift + j.w; flag.w = 0; }
//return distributionM2->shift + distributionM2->histogram->K[j];
if(flag.x) result.x = distributionM2->shift + histogramKjx;
if(flag.y) result.y = distributionM2->shift + histogramKjy;
if(flag.z) result.z = distributionM2->shift + histogramKjz;
if(flag.w) result.w = distributionM2->shift + histogramKjw;
return result;
}
template <typename STATE>
QUALIFIERS unsigned int curand_M2_double(STATE *state, curandDistributionM2Shift_t distributionM2)
{
return _curand_M2_double(curand(state), distributionM2);
}
template <typename STATE>
QUALIFIERS uint4 curand_M2_double4(STATE *state, curandDistributionM2Shift_t distributionM2)
{
return _curand_M2_double4(curand4(state), distributionM2);
}
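/* Editor's illustrative sketch, not part of the original header: the square
 * histogram lookup above, restated with a hypothetical standalone struct in
 * place of curandDistributionM2Shift_t. One uniform draw in [0,1) picks a
 * cell; the cell either keeps its own value or jumps to its alias, so the
 * expected cost per sample is O(1). */
struct example_square_histogram {
    int length;                 /* number of cells */
    unsigned int shift;         /* value represented by cell 0 */
    const double *V;            /* per-cell thresholds */
    const unsigned int *K;      /* per-cell alias targets */
};

static unsigned int example_sample_square_histogram(const struct example_square_histogram *h,
                                                    double u /* in [0,1) */)
{
    int j = (int)(h->length * u);           /* uniform cell pick */
    return (u < h->V[j]) ? h->shift + (unsigned int)j : h->shift + h->K[j];
}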
template <typename T>
QUALIFIERS unsigned int _curand_binary_search_double(T x, curandDistributionShift_t distribution)
{
double u = _curand_uniform_double(x);
int min = 0;
int max = distribution->length-1;
do{
int mid = (max + min)/2;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
double probability_mid = __ldg( &(distribution->probability[mid]));
#else
double probability_mid = distribution->probability[mid];
#endif
if (u <= probability_mid){
max = mid;
}else{
min = mid+1;
}
}while (min < max);
return distribution->shift + min;
}
template <typename STATE>
QUALIFIERS unsigned int curand_binary_search_double(STATE *state, curandDistributionShift_t distribution)
{
return _curand_binary_search_double(curand(state), distribution);
}
// Generates uniformly distributed double values in range (0.0; 1.0) from uniformly distributed
// unsigned int. We can't use standard _curand_uniform_double since it can generate 1.0.
// This is required only for _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned int x)
{
return x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
}
// Overload for unsigned long long.
// This is required only for _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned long long x)
{
return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/4.0);
}
#define MAGIC_DOUBLE_CONST 500.0
template <typename T>
//George S. Fishman Discrete-event simulation: modeling, programming, and analysis
QUALIFIERS unsigned int _curand_poisson_ITR_double(T x, double lambda)
{
double L,p = 1.0;
double q = 1.0;
unsigned int k = 0;
int pow=0;
// This algorithm requires u to be in (0;1) range, however, _curand_uniform_double
// returns a number in range (0;1]. If u is 1.0 the inner loop never ends. The
// following operation transforms the range from (0;1] to (0;1).
double u = _curand_uniform_double_excluding_one(x);
do{
if (lambda > (double)(pow+MAGIC_DOUBLE_CONST)){
L = exp(-MAGIC_DOUBLE_CONST);
}else{
L = exp((double)(pow - lambda));
}
p *= L;
q *= L;
pow += (int) MAGIC_DOUBLE_CONST;
while (u > q){
k++;
p *= ((double)lambda / (double) k);
q += p;
}
}while((double)pow < lambda);
return k;
}
template <typename T>
/* Rejection Method for Poisson distribution based on gammainc approximation */
QUALIFIERS unsigned int curand_poisson_gammainc(T state, float lambda){
float y, x, t, z,v;
float logl = __cr_log (lambda);
while (true) {
y = curand_uniform (state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
return (unsigned int)x;
}
template <typename T>
/* Rejection Method for Poisson distribution based on gammainc approximation */
QUALIFIERS uint4 curand_poisson_gammainc4(T state, float lambda){
uint4 result;
float y, x, t, z,v;
float logl = __cr_log (lambda);
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.x = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.y = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.z = (unsigned int)x;
while (true) {
y = curand_uniform(state);
x = __cr_pgammaincinv (lambda, y);
x = floorf (x);
z = curand_uniform (state);
v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
z = z*v;
t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
if ((z < t) && (v>=1e-20))
break;
}
result.w = (unsigned int)x;
return result;
}
// Note below that the round to nearest integer, where needed, is done in line with code that
// assumes the range of values is < 2**32
template <typename T>
QUALIFIERS unsigned int _curand_poisson(T x, double lambda)
{
if (lambda < 1000)
return _curand_poisson_ITR_double(x, lambda);
return (unsigned int)((sqrt(lambda) * _curand_normal_icdf_double(x)) + lambda + 0.5); //Round to nearest
}
template <typename T>
QUALIFIERS unsigned int _curand_poisson_from_normal(T x, double lambda)
{
return (unsigned int)((sqrt(lambda) * _curand_normal_icdf(x)) + lambda + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS unsigned int curand_poisson_from_normal(STATE state, double lambda)
{
return (unsigned int)((sqrt(lambda) * curand_normal(state)) + lambda + 0.5); //Round to nearest
}
template <typename STATE>
QUALIFIERS uint4 curand_poisson_from_normal4(STATE state, double lambda)
{
uint4 result;
float4 _res;
_res = curand_normal4(state);
result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
return result; //Round to nearest
}
/**
 * \brief Return a Poisson-distributed unsigned int from an XORWOW generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the XORWOW generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateXORWOW_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
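/* Editor's illustrative sketch, not part of the original header: per-thread
 * Poisson draws; curand_poisson() itself chooses between the Knuth loop, the
 * gammainc rejection method and the normal approximation based on lambda.
 * The kernel name and indexing are assumptions; a standalone .cu file would
 * #include <curand_kernel.h>. */
__global__ void example_fill_poisson(unsigned int *out, int n, double lambda,
                                     unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStateXORWOW_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_poisson(&state, lambda);
}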
/**
* \brief Return a Poisson-distributed unsigned int from a Philox4_32_10 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStatePhilox4_32_10_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
/**
* \brief Return four Poisson-distributed unsigned ints from a Philox4_32_10 generator.
*
 * Return four unsigned ints from a Poisson
* distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
 * \return Poisson-distributed uint4 with lambda \p lambda
*/
QUALIFIERS uint4 curand_poisson4(curandStatePhilox4_32_10_t *state, double lambda)
{
uint4 result;
double4 _res;
if (lambda < 64)
return curand_poisson_knuth4(state, (float)lambda);
if (lambda > 4000) {
_res = curand_normal4_double(state);
result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
return result;
}
return curand_poisson_gammainc4(state, (float)lambda);
}
/**
 * \brief Return a Poisson-distributed unsigned int from an MRG32k3a generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the MRG32k3a generator in \p state,
* increment the position of the generator by a variable amount, depending
* on the algorithm used.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateMRG32k3a_t *state, double lambda)
{
if (lambda < 64)
return curand_poisson_knuth(state, (float)lambda);
if (lambda > 4000)
return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
return curand_poisson_gammainc(state, (float)lambda);
}
/**
 * \brief Return a Poisson-distributed unsigned int from an MTGP32 generator.
*
 * Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the MTGP32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateMtgp32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a Sobol32 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Sobol32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateSobol32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a scrambled Sobol32 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the scrambled Sobol32 generator in \p state,
* increment the position of the generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol32_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a Sobol64 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateSobol64_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
/**
* \brief Return a Poisson-distributed unsigned int from a scrambled Sobol64 generator.
*
* Return a single unsigned int from a Poisson
* distribution with lambda \p lambda from the scrambled Sobol64 generator in \p state,
* increment position of generator by one.
*
* \param state - Pointer to state to update
* \param lambda - Lambda of the Poisson distribution
*
* \return Poisson-distributed unsigned int with lambda \p lambda
*/
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol64_t *state, double lambda)
{
return _curand_poisson(curand(state), lambda);
}
#endif // !defined(CURAND_POISSON_H_)

File diff suppressed because it is too large

View file

@@ -1,498 +0,0 @@
/* Copyright 2010-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* The source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CURAND_UNIFORM_H_)
#define CURAND_UNIFORM_H_
/**
* \defgroup DEVICE Device API
*
* @{
*/
#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
QUALIFIERS float _curand_uniform(unsigned int x)
{
return x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
QUALIFIERS float4 _curand_uniform4(uint4 x)
{
float4 y;
y.x = x.x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.y = x.y * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.z = x.z * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
y.w = x.w * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
return y;
}
QUALIFIERS float _curand_uniform(unsigned long long x)
{
unsigned int t;
t = (unsigned int)(x >> 32);
return t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
QUALIFIERS double _curand_uniform_double(unsigned int x)
{
return x * CURAND_2POW32_INV_DOUBLE + CURAND_2POW32_INV_DOUBLE;
}
QUALIFIERS double _curand_uniform_double(unsigned long long x)
{
return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
}
QUALIFIERS double _curand_uniform_double_hq(unsigned int x, unsigned int y)
{
unsigned long long z = (unsigned long long)x ^
((unsigned long long)y << (53 - 32));
return z * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
}
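/* Editor's illustrative restatement, not part of the original header: how the
 * high-quality path packs two 32-bit draws into the 53 significant bits of a
 * double. The function name and the literal constants (1/2^53 and 1/2^54) are
 * assumptions standing in for CURAND_2POW53_INV_DOUBLE and its half. */
QUALIFIERS double example_uniform_double_from_u32_pair(unsigned int x, unsigned int y)
{
    /* y is shifted so the pair spans 53 bits; the overlapping bits fold in via XOR */
    unsigned long long z = (unsigned long long)x ^ ((unsigned long long)y << 21);
    /* 2^-53 scaling plus a half-step gives the documented range excluding 0.0 */
    return z * 1.1102230246251565e-16 + 5.5511151231257827e-17;
}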
QUALIFIERS float curand_uniform(curandStateTest_t *state)
{
return _curand_uniform(curand(state));
}
QUALIFIERS double curand_uniform_double(curandStateTest_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from an XORWOW generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the XORWOW generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation may use any number of calls to \p curand() to
* get enough random bits to create the return value. The current
* implementation uses one call.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateXORWOW_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from an XORWOW generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the XORWOW generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation may use any number of calls to \p curand() to
* get enough random bits to create the return value. The current
* implementation uses exactly two calls.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateXORWOW_t *state)
{
unsigned int x, y;
x = curand(state);
y = curand(state);
return _curand_uniform_double_hq(x, y);
}
/**
* \brief Return a uniformly distributed float from an MRG32k3a generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the MRG32k3a generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation returns up to 23 bits of mantissa, with the minimum
* return value \f$ 2^{-32} \f$
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateMRG32k3a_t *state)
{
return ((float)(curand_MRG32k3a(state)*MRG32K3A_NORM));
}
/**
* \brief Return a uniformly distributed double from an MRG32k3a generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the MRG32k3a generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* Note the implementation returns at most 32 random bits of mantissa as
* outlined in the seminal paper by L'Ecuyer.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateMRG32k3a_t *state)
{
return curand_MRG32k3a(state)*MRG32K3A_NORM;
}
/**
 * \brief Return a uniformly distributed tuple of 2 doubles from a Philox4_32_10 generator.
 *
 * Return two uniformly distributed doubles (as a double2) between \p 0.0 and \p 1.0
* from the Philox4_32_10 generator in \p state, increment position of generator by 4.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return two uniformly distributed doubles between \p 0.0 and \p 1.0
*/
QUALIFIERS double2 curand_uniform2_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x;
double2 result;
_x = curand4(state);
result.x = _curand_uniform_double_hq(_x.x,_x.y);
result.y = _curand_uniform_double_hq(_x.z,_x.w);
return result;
}
// not a part of API
QUALIFIERS double4 curand_uniform4_double(curandStatePhilox4_32_10_t *state)
{
uint4 _x, _y;
double4 result;
_x = curand4(state);
_y = curand4(state);
result.x = _curand_uniform_double_hq(_x.x,_x.y);
result.y = _curand_uniform_double_hq(_x.z,_x.w);
result.z = _curand_uniform_double_hq(_y.x,_y.y);
result.w = _curand_uniform_double_hq(_y.z,_y.w);
return result;
}
/**
* \brief Return a uniformly distributed float from a Philox4_32_10 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Philox4_32_10 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed float between \p 0.0f and \p 1.0f
*
*/
QUALIFIERS float curand_uniform(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed tuple of 4 floats from a Philox4_32_10 generator.
*
 * Return four uniformly distributed floats (as a float4) between \p 0.0f and \p 1.0f
* from the Philox4_32_10 generator in \p state, increment position of generator by 4.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed float4 where each element is between \p 0.0f and \p 1.0f
*
*/
QUALIFIERS float4 curand_uniform4(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform4(curand4(state));
}
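/* Editor's illustrative sketch, not part of the original header: four uniform
 * floats per call from a Philox4_32_10 state. The kernel name and indexing are
 * assumptions; a standalone .cu file would #include <curand_kernel.h>. */
__global__ void example_fill_uniform4(float *out, int n, unsigned long long seed)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t state;
    curand_init(seed, tid, 0, &state);
    for (int i = 4 * tid; i + 3 < n; i += 4 * gridDim.x * blockDim.x) {
        float4 u = curand_uniform4(&state);  /* each component lies in (0.0f, 1.0f] */
        out[i]     = u.x;
        out[i + 1] = u.y;
        out[i + 2] = u.z;
        out[i + 3] = u.w;
    }
}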
/**
 * \brief Return a uniformly distributed float from an MTGP32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the MTGP32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateMtgp32_t *state)
{
return _curand_uniform(curand(state));
}
/**
 * \brief Return a uniformly distributed double from an MTGP32 generator.
 *
 * Return a uniformly distributed double between \p 0.0 and \p 1.0
 * from the MTGP32 generator in \p state, increment position of generator.
 * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
 * point outputs are never returned.
 *
 * Note that the implementation uses only 32 random bits to generate a single double
 * precision value.
 *
 * \param state - Pointer to state to update
 *
 * \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateMtgp32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Philox4_32_10 generator.
*
 * Return a uniformly distributed double between \p 0.0 and \p 1.0
 * from the Philox4_32_10 generator in \p state, increment position of generator.
 * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \p curand_uniform2_double() is recommended for higher quality uniformly distributed
* double precision values.
*
* \param state - Pointer to state to update
*
 * \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStatePhilox4_32_10_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a Sobol32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateSobol32_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Sobol32 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateSobol32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a scrambled Sobol32 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the scrambled Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateScrambledSobol32_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a scrambled Sobol32 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the scrambled Sobol32 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* Note that the implementation uses only 32 random bits to generate a single double
* precision value.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateScrambledSobol32_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a Sobol64 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateSobol64_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a Sobol64 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateSobol64_t *state)
{
return _curand_uniform_double(curand(state));
}
/**
* \brief Return a uniformly distributed float from a scrambled Sobol64 generator.
*
* Return a uniformly distributed float between \p 0.0f and \p 1.0f
* from the scrambled Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand().
*
* \param state - Pointer to state to update
*
* \return uniformly distributed float between \p 0.0f and \p 1.0f
*/
QUALIFIERS float curand_uniform(curandStateScrambledSobol64_t *state)
{
return _curand_uniform(curand(state));
}
/**
* \brief Return a uniformly distributed double from a scrambled Sobol64 generator.
*
* Return a uniformly distributed double between \p 0.0 and \p 1.0
* from the scrambled Sobol64 generator in \p state, increment position of generator.
* Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
* point outputs are never returned.
*
* The implementation is guaranteed to use a single call to \p curand()
* to preserve the quasirandom properties of the sequence.
*
* \param state - Pointer to state to update
*
* \return uniformly distributed double between \p 0.0 and \p 1.0
*/
QUALIFIERS double curand_uniform_double(curandStateScrambledSobol64_t *state)
{
return _curand_uniform_double(curand(state));
}
#endif // !defined(CURAND_UNIFORM_H_)