Blender V2.61 - r43446
|
00001 /* 00002 * Adapted from code Copyright 2009-2010 NVIDIA Corporation 00003 * Modifications Copyright 2011, Blender Foundation. 00004 * 00005 * Licensed under the Apache License, Version 2.0 (the "License"); 00006 * you may not use this file except in compliance with the License. 00007 * You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 CCL_NAMESPACE_BEGIN 00019 00020 /* 00021 * "Persistent while-while kernel" used in: 00022 * 00023 * "Understanding the Efficiency of Ray Traversal on GPUs", 00024 * Timo Aila and Samuli Laine, 00025 * Proc. High-Performance Graphics 2009 00026 */ 00027 00028 /* bottom-most stack entry, indicating the end of traversal */ 00029 00030 #define ENTRYPOINT_SENTINEL 0x76543210 00031 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ 00032 #define BVH_STACK_SIZE 192 00033 #define BVH_NODE_SIZE 4 00034 #define TRI_NODE_SIZE 3 00035 00036 /* silly workaround for float extended precision that happens when compiling 00037 without sse support on x86, it results in different results for float ops 00038 that you would otherwise expect to compare correctly */ 00039 #if !defined(__i386__) || defined(__SSE__) 00040 #define NO_EXTENDED_PRECISION 00041 #else 00042 #define NO_EXTENDED_PRECISION volatile 00043 #endif 00044 00045 __device_inline float3 bvh_inverse_direction(float3 dir) 00046 { 00047 /* avoid divide by zero (ooeps = exp2f(-80.0f)) */ 00048 float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f; 00049 float3 idir; 00050 00051 idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x)); 00052 idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y)); 00053 idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z)); 00054 00055 return idir; 00056 } 00057 00058 __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) 00059 { 00060 Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); 00061 00062 *P = transform(&tfm, ray->P); 00063 00064 float3 dir = transform_direction(&tfm, ray->D); 00065 00066 float len; 00067 dir = normalize_len(dir, &len); 00068 00069 *idir = bvh_inverse_direction(dir); 00070 00071 if(*t != FLT_MAX) 00072 *t *= len; 00073 } 00074 00075 __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) 00076 { 00077 Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); 00078 00079 if(*t != FLT_MAX) 00080 *t *= len(transform_direction(&tfm, 1.0f/(*idir))); 00081 00082 *P = ray->P; 00083 *idir = bvh_inverse_direction(ray->D); 00084 } 00085 00086 /* intersect two bounding boxes */ 00087 __device_inline void bvh_node_intersect(KernelGlobals *kg, 00088 bool *traverseChild0, bool *traverseChild1, 00089 bool *closestChild1, int *nodeAddr0, int *nodeAddr1, 00090 float3 P, float3 idir, float t, uint visibility, int nodeAddr) 00091 { 00092 /* fetch node data */ 00093 float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); 00094 float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); 00095 float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); 00096 float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); 00097 00098 /* intersect ray against child nodes */ 00099 float3 ood = P * idir; 00100 float c0lox = n0xy.x * idir.x - ood.x; 00101 float c0hix = n0xy.y * idir.x - ood.x; 00102 float c0loy = n0xy.z * idir.y - ood.y; 00103 float c0hiy = n0xy.w * idir.y - ood.y; 00104 float c0loz = nz.x * idir.z - ood.z; 00105 float c0hiz = nz.y * idir.z - ood.z; 00106 NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); 00107 NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); 00108 00109 float c1loz = nz.z * idir.z - ood.z; 00110 float c1hiz = nz.w * idir.z - ood.z; 00111 float c1lox = n1xy.x * idir.x - ood.x; 00112 float c1hix = n1xy.y * idir.x - ood.x; 00113 float c1loy = n1xy.z * idir.y - ood.y; 00114 float c1hiy = n1xy.w * idir.y - ood.y; 00115 NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); 00116 NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); 00117 00118 /* decide which nodes to traverse next */ 00119 #ifdef __VISIBILITY_FLAG__ 00120 /* this visibility test gives a 5% performance hit, how to solve? */ 00121 *traverseChild0 = (c0max >= c0min) && (__float_as_int(cnodes.z) & visibility); 00122 *traverseChild1 = (c1max >= c1min) && (__float_as_int(cnodes.w) & visibility); 00123 #else 00124 *traverseChild0 = (c0max >= c0min); 00125 *traverseChild1 = (c1max >= c1min); 00126 #endif 00127 00128 *nodeAddr0 = __float_as_int(cnodes.x); 00129 *nodeAddr1 = __float_as_int(cnodes.y); 00130 00131 *closestChild1 = (c1min < c0min); 00132 } 00133 00134 /* Sven Woop's algorithm */ 00135 __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect, 00136 float3 P, float3 idir, uint visibility, int object, int triAddr) 00137 { 00138 /* compute and check intersection t-value */ 00139 float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); 00140 float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); 00141 float3 dir = 1.0f/idir; 00142 00143 float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; 00144 float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); 00145 float t = Oz * invDz; 00146 00147 if(t > 0.0f && t < isect->t) { 00148 /* compute and check barycentric u */ 00149 float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z; 00150 float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z; 00151 float u = Ox + t*Dx; 00152 00153 if(u >= 0.0f) { 00154 /* compute and check barycentric v */ 00155 float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); 00156 float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z; 00157 float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z; 00158 float v = Oy + t*Dy; 00159 00160 if(v >= 0.0f && u + v <= 1.0f) { 00161 #ifdef __VISIBILITY_FLAG__ 00162 /* visibility flag test. we do it here under the assumption 00163 that most triangles are culled by node flags */ 00164 if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) 00165 #endif 00166 { 00167 /* record intersection */ 00168 isect->prim = triAddr; 00169 isect->object = object; 00170 isect->u = u; 00171 isect->v = v; 00172 isect->t = t; 00173 } 00174 } 00175 } 00176 } 00177 } 00178 00179 __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) 00180 { 00181 /* traversal stack in CUDA thread-local memory */ 00182 int traversalStack[BVH_STACK_SIZE]; 00183 traversalStack[0] = ENTRYPOINT_SENTINEL; 00184 00185 /* traversal variables in registers */ 00186 int stackPtr = 0; 00187 int nodeAddr = kernel_data.bvh.root; 00188 00189 /* ray parameters in registers */ 00190 const float tmax = ray->t; 00191 float3 P = ray->P; 00192 float3 idir = bvh_inverse_direction(ray->D); 00193 int object = ~0; 00194 00195 isect->t = tmax; 00196 isect->object = ~0; 00197 isect->prim = ~0; 00198 isect->u = 0.0f; 00199 isect->v = 0.0f; 00200 00201 /* traversal loop */ 00202 do { 00203 do 00204 { 00205 /* traverse internal nodes */ 00206 while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) 00207 { 00208 bool traverseChild0, traverseChild1, closestChild1; 00209 int nodeAddrChild1; 00210 00211 bvh_node_intersect(kg, &traverseChild0, &traverseChild1, 00212 &closestChild1, &nodeAddr, &nodeAddrChild1, 00213 P, idir, isect->t, visibility, nodeAddr); 00214 00215 if(traverseChild0 != traverseChild1) { 00216 /* one child was intersected */ 00217 if(traverseChild1) { 00218 nodeAddr = nodeAddrChild1; 00219 } 00220 } 00221 else { 00222 if(!traverseChild0) { 00223 /* neither child was intersected */ 00224 nodeAddr = traversalStack[stackPtr]; 00225 --stackPtr; 00226 } 00227 else { 00228 /* both children were intersected, push the farther one */ 00229 if(closestChild1) { 00230 int tmp = nodeAddr; 00231 nodeAddr = nodeAddrChild1; 00232 nodeAddrChild1 = tmp; 00233 } 00234 00235 ++stackPtr; 00236 traversalStack[stackPtr] = nodeAddrChild1; 00237 } 00238 } 00239 } 00240 00241 /* if node is leaf, fetch triangle list */ 00242 if(nodeAddr < 0) { 00243 float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); 00244 int primAddr = __float_as_int(leaf.x); 00245 00246 #ifdef __INSTANCING__ 00247 if(primAddr >= 0) { 00248 #endif 00249 int primAddr2 = __float_as_int(leaf.y); 00250 00251 /* pop */ 00252 nodeAddr = traversalStack[stackPtr]; 00253 --stackPtr; 00254 00255 /* triangle intersection */ 00256 while(primAddr < primAddr2) { 00257 /* intersect ray against triangle */ 00258 bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); 00259 00260 /* shadow ray early termination */ 00261 if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0) 00262 return true; 00263 00264 primAddr++; 00265 } 00266 #ifdef __INSTANCING__ 00267 } 00268 else { 00269 /* instance push */ 00270 object = kernel_tex_fetch(__prim_object, -primAddr-1); 00271 00272 bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); 00273 00274 ++stackPtr; 00275 traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; 00276 00277 nodeAddr = kernel_tex_fetch(__object_node, object); 00278 } 00279 #endif 00280 } 00281 } while(nodeAddr != ENTRYPOINT_SENTINEL); 00282 00283 #ifdef __INSTANCING__ 00284 if(stackPtr >= 0) { 00285 kernel_assert(object != ~0); 00286 00287 /* instance pop */ 00288 bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); 00289 object = ~0; 00290 nodeAddr = traversalStack[stackPtr]; 00291 --stackPtr; 00292 } 00293 #endif 00294 } while(nodeAddr != ENTRYPOINT_SENTINEL); 00295 00296 return (isect->prim != ~0); 00297 } 00298 00299 __device_inline float3 ray_offset(float3 P, float3 Ng) 00300 { 00301 #ifdef __INTERSECTION_REFINE__ 00302 const float epsilon_f = 1e-5f; 00303 const int epsilon_i = 32; 00304 00305 float3 res; 00306 00307 /* x component */ 00308 if(fabsf(P.x) < epsilon_f) { 00309 res.x = P.x + Ng.x*epsilon_f; 00310 } 00311 else { 00312 uint ix = __float_as_uint(P.x); 00313 ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i; 00314 res.x = __uint_as_float(ix); 00315 } 00316 00317 /* y component */ 00318 if(fabsf(P.y) < epsilon_f) { 00319 res.y = P.y + Ng.y*epsilon_f; 00320 } 00321 else { 00322 uint iy = __float_as_uint(P.y); 00323 iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i; 00324 res.y = __uint_as_float(iy); 00325 } 00326 00327 /* z component */ 00328 if(fabsf(P.z) < epsilon_f) { 00329 res.z = P.z + Ng.z*epsilon_f; 00330 } 00331 else { 00332 uint iz = __float_as_uint(P.z); 00333 iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i; 00334 res.z = __uint_as_float(iz); 00335 } 00336 00337 return res; 00338 #else 00339 const float epsilon_f = 1e-4f; 00340 return P + epsilon_f*Ng; 00341 #endif 00342 } 00343 00344 __device_inline float3 bvh_triangle_refine(KernelGlobals *kg, const Intersection *isect, const Ray *ray) 00345 { 00346 float3 P = ray->P; 00347 float3 D = ray->D; 00348 float t = isect->t; 00349 00350 #ifdef __INTERSECTION_REFINE__ 00351 if(isect->object != ~0) { 00352 Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); 00353 00354 P = transform(&tfm, P); 00355 D = transform_direction(&tfm, D*t); 00356 D = normalize_len(D, &t); 00357 } 00358 00359 P = P + D*t; 00360 00361 float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0); 00362 float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; 00363 float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z); 00364 float rt = Oz * invDz; 00365 00366 P = P + D*rt; 00367 00368 if(isect->object != ~0) { 00369 Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); 00370 P = transform(&tfm, P); 00371 } 00372 00373 return P; 00374 #else 00375 return P + D*t; 00376 #endif 00377 } 00378 00379 CCL_NAMESPACE_END 00380