Blender V2.61 - r43446

device_cuda.cpp

/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"

CCL_NAMESPACE_BEGIN

class CUDADevice : public Device
{
public:
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    map<device_ptr, bool> tex_interp_map;
    int cuDevId;

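    /* OpenGL pixel buffer object and texture pair used for interactive display;
       the PBO is registered with CUDA graphics interop so the film result can be
       written into it directly and then drawn through the texture */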
    struct PixelMem {
        GLuint cuPBO;
        CUgraphicsResource cuPBOresource;
        GLuint cuTexId;
        int w, h;
    };

    map<device_ptr, PixelMem> pixel_mem_map;

    CUdeviceptr cuda_device_ptr(device_ptr mem)
    {
        return (CUdeviceptr)mem;
    }

    const char *cuda_error_string(CUresult result)
    {
        switch(result) {
            case CUDA_SUCCESS: return "No errors";
            case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
            case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
            case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
            case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

            case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
            case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

            case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
            case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
            case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
            case CUDA_ERROR_MAP_FAILED: return "Map failed";
            case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
            case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
            case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
            case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
            case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
            case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
            case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
            case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
            case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
            case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

            case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
            case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
            case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
            case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

            case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

            case CUDA_ERROR_NOT_FOUND: return "Not found";

            case CUDA_ERROR_NOT_READY: return "CUDA not ready";

            case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
            case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
            case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
            case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

            case CUDA_ERROR_UNKNOWN: return "Unknown error";

            default: return "Unknown CUDA error value";
        }
    }

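    /* round offset up to the next multiple of alignment (a power of two);
       the aligned value is returned, the offset argument itself is not modified */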
    static int cuda_align_up(int& offset, int alignment)
    {
        return (offset + alignment - 1) & ~(alignment - 1);
    }

#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif

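/* check the result of a CUDA driver API call; on failure the first error message
   is recorded, printed to stderr, and execution aborts in debug builds */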
#define cuda_assert(stmt) \
    { \
        CUresult result = stmt; \
        \
        if(result != CUDA_SUCCESS) { \
            string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
            if(error_msg == "") \
                error_msg = message; \
            fprintf(stderr, "%s\n", message.c_str()); \
            cuda_abort(); \
        } \
    }

    bool cuda_error(CUresult result)
    {
        if(result == CUDA_SUCCESS)
            return false;

        string message = string_printf("CUDA error: %s", cuda_error_string(result));
        if(error_msg == "")
            error_msg = message;
        fprintf(stderr, "%s\n", message.c_str());
        return true;
    }

    void cuda_error(const string& message)
    {
        if(error_msg == "")
            error_msg = message;
        fprintf(stderr, "%s\n", message.c_str());
    }

    void cuda_push_context()
    {
        cuda_assert(cuCtxSetCurrent(cuContext))
    }

    void cuda_pop_context()
    {
        cuda_assert(cuCtxSetCurrent(NULL));
    }

    CUDADevice(DeviceInfo& info, bool background_)
    {
        background = background_;

        cuDevId = info.num;
        cuDevice = 0;
        cuContext = 0;

        /* initialize */
        if(cuda_error(cuInit(0)))
            return;

        /* setup device and context */
        if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
            return;

        CUresult result;

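        /* create the CUDA context; use a GL interop context when rendering to the
           viewport so the result can be shared with OpenGL */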
        if(background)
            result = cuCtxCreate(&cuContext, 0, cuDevice);
        else
            result = cuGLCtxCreate(&cuContext, 0, cuDevice);

        if(cuda_error(result))
            return;

        cuda_pop_context();
    }

    ~CUDADevice()
    {
        cuda_push_context();
        cuda_assert(cuCtxDetach(cuContext))
    }

    bool support_full_kernel()
    {
        int major, minor;
        cuDeviceComputeCapability(&major, &minor, cuDevId);

        return (major >= 2);
    }

    string description()
    {
        /* print device information */
        char deviceName[256];

        cuda_push_context();
        cuDeviceGetName(deviceName, 256, cuDevId);
        cuda_pop_context();

        return string("CUDA ") + deviceName;
    }

    bool support_device(bool experimental)
    {
        if(!experimental) {
            int major, minor;
            cuDeviceComputeCapability(&major, &minor, cuDevId);

            if(major <= 1 && minor <= 2) {
                cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
                return false;
            }
        }

        return true;
    }

    string compile_kernel()
    {
        /* compute cubin name */
        int major, minor;
        cuDeviceComputeCapability(&major, &minor, cuDevId);

        /* attempt to use kernel provided with blender */
        string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
        if(path_exists(cubin))
            return cubin;

        /* not found, try to use locally compiled kernel */
        string kernel_path = path_get("kernel");
        string md5 = path_files_md5_hash(kernel_path);

        cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
        cubin = path_user_get(path_join("cache", cubin));

        /* if exists already, use it */
        if(path_exists(cubin))
            return cubin;

#if defined(WITH_CUDA_BINARIES) && defined(_WIN32)
        if(major <= 1 && minor <= 2)
            cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
        else
            cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
        return "";
#else
        /* if not, find CUDA compiler */
        string nvcc = cuCompilerPath();

        if(nvcc == "") {
            cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
            return "";
        }

        /* compile */
        string kernel = path_join(kernel_path, "kernel.cu");
        string include = kernel_path;
        const int machine = system_cpu_bits();
        const int maxreg = 24;

        double starttime = time_dt();
        printf("Compiling CUDA kernel ...\n");

        path_create_directories(cubin);

        string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
            "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
            nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());

        if(system(command.c_str()) == -1) {
            cuda_error("Failed to execute compilation command, see console for details.");
            return "";
        }

        /* verify if compilation succeeded */
        if(!path_exists(cubin)) {
            cuda_error("CUDA kernel compilation failed, see console for details.");
            return "";
        }

        printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

        return cubin;
#endif
    }

    bool load_kernels(bool experimental)
    {
        /* check if cuda init succeeded */
        if(cuContext == 0)
            return false;

        if(!support_device(experimental))
            return false;

        /* get kernel */
        string cubin = compile_kernel();

        if(cubin == "")
            return false;

        /* open module */
        cuda_push_context();

        CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
        if(cuda_error(result))
            cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

        cuda_pop_context();

        return (result == CUDA_SUCCESS);
    }

    void mem_alloc(device_memory& mem, MemoryType type)
    {
        cuda_push_context();
        CUdeviceptr device_pointer;
        cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
        mem.device_pointer = (device_ptr)device_pointer;
        cuda_pop_context();
    }

    void mem_copy_to(device_memory& mem)
    {
        cuda_push_context();
        cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
        cuda_pop_context();
    }

    void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
    {
        size_t offset = elem*y*w;
        size_t size = elem*w*h;

        cuda_push_context();
        cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
            (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
        cuda_pop_context();
    }

    void mem_zero(device_memory& mem)
    {
        memset((void*)mem.data_pointer, 0, mem.memory_size());

        cuda_push_context();
        cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
        cuda_pop_context();
    }

    void mem_free(device_memory& mem)
    {
        if(mem.device_pointer) {
            cuda_push_context();
            cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
            cuda_pop_context();

            mem.device_pointer = 0;
        }
    }

    void const_copy_to(const char *name, void *host, size_t size)
    {
        CUdeviceptr mem;
        size_t bytes;

        cuda_push_context();
        cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
        //assert(bytes == size);
        cuda_assert(cuMemcpyHtoD(mem, host, size))
        cuda_pop_context();
    }

    void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
    {
        /* determine format */
        CUarray_format_enum format;
        size_t dsize = datatype_size(mem.data_type);
        size_t size = mem.memory_size();

        switch(mem.data_type) {
            case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
            case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
            case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
            case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
            default: assert(0); return;
        }

        CUtexref texref;

        cuda_push_context();
        cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

        if(interpolation) {
            CUarray handle;
            CUDA_ARRAY_DESCRIPTOR desc;

            desc.Width = mem.data_width;
            desc.Height = mem.data_height;
            desc.Format = format;
            desc.NumChannels = mem.data_elements;

            cuda_assert(cuArrayCreate(&handle, &desc))

            if(mem.data_height > 1) {
                CUDA_MEMCPY2D param;
                memset(&param, 0, sizeof(param));
                param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                param.dstArray = handle;
                param.srcMemoryType = CU_MEMORYTYPE_HOST;
                param.srcHost = (void*)mem.data_pointer;
                param.srcPitch = mem.data_width*dsize*mem.data_elements;
                param.WidthInBytes = param.srcPitch;
                param.Height = mem.data_height;

                cuda_assert(cuMemcpy2D(&param))
            }
            else
                cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

            cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

            cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
            cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

            mem.device_pointer = (device_ptr)handle;
        }
        else {
            cuda_pop_context();

            mem_alloc(mem, MEM_READ_ONLY);
            mem_copy_to(mem);

            cuda_push_context();

            cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
            cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
            cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
        }

        if(periodic) {
            cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
            cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
        }
        else {
            cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
            cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
        }
        cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

        cuda_pop_context();

        tex_interp_map[mem.device_pointer] = interpolation;
    }

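    /* interpolated textures are backed by a CUDA array and must be destroyed with
       cuArrayDestroy; non-interpolated textures use regular device memory */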
    void tex_free(device_memory& mem)
    {
        if(mem.device_pointer) {
            if(tex_interp_map[mem.device_pointer]) {
                cuda_push_context();
                cuArrayDestroy((CUarray)mem.device_pointer);
                cuda_pop_context();

                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                mem.device_pointer = 0;
            }
            else {
                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                mem_free(mem);
            }
        }
    }

    void path_trace(DeviceTask& task)
    {
        cuda_push_context();

        CUfunction cuPathTrace;
        CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
        CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);

        /* get kernel function */
        cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

        /* pass in parameters */
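        /* arguments are laid out manually in the kernel parameter space; order and
           alignment must match the kernel_cuda_path_trace signature */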
        int offset = 0;

        cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
        offset += sizeof(d_buffer);

        cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
        offset += sizeof(d_rng_state);

        int sample = task.sample;
        offset = cuda_align_up(offset, __alignof(sample));

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.sample))
        offset += sizeof(task.sample);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
        offset += sizeof(task.x);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
        offset += sizeof(task.y);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
        offset += sizeof(task.w);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
        offset += sizeof(task.h);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.offset))
        offset += sizeof(task.offset);

        cuda_assert(cuParamSeti(cuPathTrace, offset, task.stride))
        offset += sizeof(task.stride);

        cuda_assert(cuParamSetSize(cuPathTrace, offset))

        /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
        int xthreads = 16;
        int ythreads = 16;
#else
        int xthreads = 8;
        int ythreads = 8;
#endif
        int xblocks = (task.w + xthreads - 1)/xthreads;
        int yblocks = (task.h + ythreads - 1)/ythreads;

        cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
        cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
        cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

        cuda_pop_context();
    }

    void tonemap(DeviceTask& task)
    {
        cuda_push_context();

        CUfunction cuFilmConvert;
        CUdeviceptr d_rgba = map_pixels(task.rgba);
        CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);

        /* get kernel function */
        cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))

        /* pass in parameters */
        int offset = 0;

        cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
        offset += sizeof(d_rgba);

        cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
        offset += sizeof(d_buffer);

        int sample = task.sample;
        offset = cuda_align_up(offset, __alignof(sample));

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
        offset += sizeof(task.sample);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
        offset += sizeof(task.resolution);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
        offset += sizeof(task.x);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
        offset += sizeof(task.y);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
        offset += sizeof(task.w);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
        offset += sizeof(task.h);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
        offset += sizeof(task.offset);

        cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
        offset += sizeof(task.stride);

        cuda_assert(cuParamSetSize(cuFilmConvert, offset))

        /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
        int xthreads = 16;
        int ythreads = 16;
#else
        int xthreads = 8;
        int ythreads = 8;
#endif
        int xblocks = (task.w + xthreads - 1)/xthreads;
        int yblocks = (task.h + ythreads - 1)/ythreads;

        cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
        cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
        cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

        unmap_pixels(task.rgba);

        cuda_pop_context();
    }

    void shader(DeviceTask& task)
    {
        cuda_push_context();

        CUfunction cuDisplace;
        CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
        CUdeviceptr d_offset = cuda_device_ptr(task.shader_output);

        /* get kernel function */
        cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

        /* pass in parameters */
        int offset = 0;

        cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
        offset += sizeof(d_input);

        cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
        offset += sizeof(d_offset);

        int shader_eval_type = task.shader_eval_type;
        offset = cuda_align_up(offset, __alignof(shader_eval_type));

        cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
        offset += sizeof(task.shader_eval_type);

        cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
        offset += sizeof(task.shader_x);

        cuda_assert(cuParamSetSize(cuDisplace, offset))

        /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
        int xthreads = 16;
#else
        int xthreads = 8;
#endif
        int xblocks = (task.shader_w + xthreads - 1)/xthreads;

        cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
        cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
        cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

        cuda_pop_context();
    }

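    /* when drawing to the viewport, map the registered PBO and return the CUDA
       pointer the kernel writes into; in background mode the buffer is plain
       device memory */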
    CUdeviceptr map_pixels(device_ptr mem)
    {
        if(!background) {
            PixelMem pmem = pixel_mem_map[mem];
            CUdeviceptr buffer;

            size_t bytes;
            cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
            cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

            return buffer;
        }

        return cuda_device_ptr(mem);
    }

    void unmap_pixels(device_ptr mem)
    {
        if(!background) {
            PixelMem pmem = pixel_mem_map[mem];

            cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
        }
    }

    void pixels_alloc(device_memory& mem)
    {
        if(!background) {
            PixelMem pmem;

            pmem.w = mem.data_width;
            pmem.h = mem.data_height;

            cuda_push_context();

            glGenBuffers(1, &pmem.cuPBO);
            glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
            glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);

            glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

            glGenTextures(1, &pmem.cuTexId);
            glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
            glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
            glBindTexture(GL_TEXTURE_2D, 0);

            cuda_assert(cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE))

            cuda_pop_context();

            mem.device_pointer = pmem.cuTexId;
            pixel_mem_map[mem.device_pointer] = pmem;

            return;
        }

        Device::pixels_alloc(mem);
    }

    void pixels_copy_from(device_memory& mem, int y, int w, int h)
    {
        if(!background) {
            PixelMem pmem = pixel_mem_map[mem.device_pointer];

            cuda_push_context();

            glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
            uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
            size_t offset = sizeof(uchar)*4*y*w;
            memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
            glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
            glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

            cuda_pop_context();

            return;
        }

        Device::pixels_copy_from(mem, y, w, h);
    }

    void pixels_free(device_memory& mem)
    {
        if(mem.device_pointer) {
            if(!background) {
                PixelMem pmem = pixel_mem_map[mem.device_pointer];

                cuda_push_context();

                cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
                glDeleteBuffers(1, &pmem.cuPBO);
                glDeleteTextures(1, &pmem.cuTexId);

                cuda_pop_context();

                pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
                mem.device_pointer = 0;

                return;
            }

            Device::pixels_free(mem);
        }
    }

    void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
    {
        if(!background) {
            PixelMem pmem = pixel_mem_map[mem.device_pointer];

            cuda_push_context();

            /* for multi-device rendering, this assumes the inefficient approach of
               allocating all pixels on the device even though we only render to a subset */
            size_t offset = sizeof(uint8_t)*4*y*w;

            glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
            glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
            glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
            glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

            glEnable(GL_TEXTURE_2D);

            if(transparent) {
                glEnable(GL_BLEND);
                glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
            }

            glColor3f(1.0f, 1.0f, 1.0f);

            glPushMatrix();
            glTranslatef(0.0f, (float)dy, 0.0f);

            glBegin(GL_QUADS);

            glTexCoord2f(0.0f, 0.0f);
            glVertex2f(0.0f, 0.0f);
            glTexCoord2f((float)w/(float)pmem.w, 0.0f);
            glVertex2f((float)width, 0.0f);
            glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
            glVertex2f((float)width, (float)height);
            glTexCoord2f(0.0f, (float)h/(float)pmem.h);
            glVertex2f(0.0f, (float)height);

            glEnd();

            glPopMatrix();

            if(transparent)
                glDisable(GL_BLEND);

            glBindTexture(GL_TEXTURE_2D, 0);
            glDisable(GL_TEXTURE_2D);

            cuda_pop_context();

            return;
        }

        Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
    }

    void task_add(DeviceTask& task)
    {
        if(task.type == DeviceTask::TONEMAP)
            tonemap(task);
        else if(task.type == DeviceTask::PATH_TRACE)
            path_trace(task);
        else if(task.type == DeviceTask::SHADER)
            shader(task);
    }

    void task_wait()
    {
        cuda_push_context();

        cuda_assert(cuCtxSynchronize())

        cuda_pop_context();
    }

    void task_cancel()
    {
    }
};

Device *device_cuda_create(DeviceInfo& info, bool background)
{
    return new CUDADevice(info, background);
}

void device_cuda_info(vector<DeviceInfo>& devices)
{
    int count = 0;

    if(cuInit(0) != CUDA_SUCCESS)
        return;
    if(cuDeviceGetCount(&count) != CUDA_SUCCESS)
        return;

    vector<DeviceInfo> display_devices;

    for(int num = 0; num < count; num++) {
        char name[256];
        int attr;

        if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
            continue;

        DeviceInfo info;

        info.type = DEVICE_CUDA;
        info.description = string(name);
        info.id = string_printf("CUDA_%d", num);
        info.num = num;

        /* if device has a kernel timeout, assume it is used for display */
        if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
            info.display_device = true;
            display_devices.push_back(info);
        }
        else
            devices.push_back(info);
    }

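    /* append devices flagged as display devices after the others */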
    if(!display_devices.empty())
        devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END