Blender: kernel_bvh.h Source File

Blender V2.61 - r43446
00001 /*
00002  * Adapted from code Copyright 2009-2010 NVIDIA Corporation
00003  * Modifications Copyright 2011, Blender Foundation.
00004  *
00005  * Licensed under the Apache License, Version 2.0 (the "License");
00006  * you may not use this file except in compliance with the License.
00007  * You may obtain a copy of the License at
00008  *
00009  * http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 CCL_NAMESPACE_BEGIN
00019 
00020 /*
00021  * "Persistent while-while kernel" used in:
00022  *
00023  * "Understanding the Efficiency of Ray Traversal on GPUs",
00024  * Timo Aila and Samuli Laine,
00025  * Proc. High-Performance Graphics 2009
00026  */
00027 
00028 /* bottom-most stack entry, indicating the end of traversal */
00029 
00030 #define ENTRYPOINT_SENTINEL 0x76543210
00031 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
00032 #define BVH_STACK_SIZE 192
00033 #define BVH_NODE_SIZE 4
00034 #define TRI_NODE_SIZE 3
00035 
00036 /* silly workaround for float extended precision that happens when compiling
00037    without sse support on x86, it results in different results for float ops
00038    that you would otherwise expect to compare correctly */
00039 #if !defined(__i386__) || defined(__SSE__)
00040 #define NO_EXTENDED_PRECISION
00041 #else
00042 #define NO_EXTENDED_PRECISION volatile
00043 #endif
00044 
00045 __device_inline float3 bvh_inverse_direction(float3 dir)
00046 {
00047     /* avoid divide by zero (ooeps = exp2f(-80.0f)) */
00048     float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
00049     float3 idir;
00050 
00051     idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
00052     idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
00053     idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
00054 
00055     return idir;
00056 }
00057 
00058 __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
00059 {
00060     Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
00061 
00062     *P = transform(&tfm, ray->P);
00063 
00064     float3 dir = transform_direction(&tfm, ray->D);
00065 
00066     float len;
00067     dir = normalize_len(dir, &len);
00068 
00069     *idir = bvh_inverse_direction(dir);
00070 
00071     if(*t != FLT_MAX)
00072         *t *= len;
00073 }
00074 
00075 __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
00076 {
00077     Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
00078 
00079     if(*t != FLT_MAX)
00080         *t *= len(transform_direction(&tfm, 1.0f/(*idir)));
00081 
00082     *P = ray->P;
00083     *idir = bvh_inverse_direction(ray->D);
00084 }
00085 
00086 /* intersect two bounding boxes */
00087 __device_inline void bvh_node_intersect(KernelGlobals *kg,
00088     bool *traverseChild0, bool *traverseChild1,
00089     bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
00090     float3 P, float3 idir, float t, uint visibility, int nodeAddr)
00091 {
00092     /* fetch node data */
00093     float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
00094     float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
00095     float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
00096     float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
00097 
00098     /* intersect ray against child nodes */
00099     float3 ood = P * idir;
00100     float c0lox = n0xy.x * idir.x - ood.x;
00101     float c0hix = n0xy.y * idir.x - ood.x;
00102     float c0loy = n0xy.z * idir.y - ood.y;
00103     float c0hiy = n0xy.w * idir.y - ood.y;
00104     float c0loz = nz.x * idir.z - ood.z;
00105     float c0hiz = nz.y * idir.z - ood.z;
00106     NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
00107     NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
00108 
00109     float c1loz = nz.z * idir.z - ood.z;
00110     float c1hiz = nz.w * idir.z - ood.z;
00111     float c1lox = n1xy.x * idir.x - ood.x;
00112     float c1hix = n1xy.y * idir.x - ood.x;
00113     float c1loy = n1xy.z * idir.y - ood.y;
00114     float c1hiy = n1xy.w * idir.y - ood.y;
00115     NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
00116     NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
00117 
00118     /* decide which nodes to traverse next */
00119 #ifdef __VISIBILITY_FLAG__
00120     /* this visibility test gives a 5% performance hit, how to solve? */
00121     *traverseChild0 = (c0max >= c0min) && (__float_as_int(cnodes.z) & visibility);
00122     *traverseChild1 = (c1max >= c1min) && (__float_as_int(cnodes.w) & visibility);
00123 #else
00124     *traverseChild0 = (c0max >= c0min);
00125     *traverseChild1 = (c1max >= c1min);
00126 #endif
00127 
00128     *nodeAddr0 = __float_as_int(cnodes.x);
00129     *nodeAddr1 = __float_as_int(cnodes.y);
00130 
00131     *closestChild1 = (c1min < c0min);
00132 }
00133 
00134 /* Sven Woop's algorithm */
00135 __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
00136     float3 P, float3 idir, uint visibility, int object, int triAddr)
00137 {
00138     /* compute and check intersection t-value */
00139     float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
00140     float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
00141     float3 dir = 1.0f/idir;
00142 
00143     float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
00144     float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
00145     float t = Oz * invDz;
00146 
00147     if(t > 0.0f && t < isect->t) {
00148         /* compute and check barycentric u */
00149         float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
00150         float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
00151         float u = Ox + t*Dx;
00152 
00153         if(u >= 0.0f) {
00154             /* compute and check barycentric v */
00155             float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
00156             float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
00157             float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
00158             float v = Oy + t*Dy;
00159 
00160             if(v >= 0.0f && u + v <= 1.0f) {
00161 #ifdef __VISIBILITY_FLAG__
00162                 /* visibility flag test. we do it here under the assumption
00163                    that most triangles are culled by node flags */
00164                 if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
00165 #endif
00166                 {
00167                     /* record intersection */
00168                     isect->prim = triAddr;
00169                     isect->object = object;
00170                     isect->u = u;
00171                     isect->v = v;
00172                     isect->t = t;
00173                 }
00174             }
00175         }
00176     }
00177 }
00178 
00179 __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
00180 {
00181     /* traversal stack in CUDA thread-local memory */
00182     int traversalStack[BVH_STACK_SIZE];
00183     traversalStack[0] = ENTRYPOINT_SENTINEL;
00184 
00185     /* traversal variables in registers */
00186     int stackPtr = 0;
00187     int nodeAddr = kernel_data.bvh.root;
00188 
00189     /* ray parameters in registers */
00190     const float tmax = ray->t;
00191     float3 P = ray->P;
00192     float3 idir = bvh_inverse_direction(ray->D);
00193     int object = ~0;
00194 
00195     isect->t = tmax;
00196     isect->object = ~0;
00197     isect->prim = ~0;
00198     isect->u = 0.0f;
00199     isect->v = 0.0f;
00200 
00201     /* traversal loop */
00202     do {
00203         do
00204         {
00205             /* traverse internal nodes */
00206             while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
00207             {
00208                 bool traverseChild0, traverseChild1, closestChild1;
00209                 int nodeAddrChild1;
00210 
00211                 bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
00212                     &closestChild1, &nodeAddr, &nodeAddrChild1,
00213                     P, idir, isect->t, visibility, nodeAddr);
00214 
00215                 if(traverseChild0 != traverseChild1) {
00216                     /* one child was intersected */
00217                     if(traverseChild1) {
00218                         nodeAddr = nodeAddrChild1;
00219                     }
00220                 }
00221                 else {
00222                     if(!traverseChild0) {
00223                         /* neither child was intersected */
00224                         nodeAddr = traversalStack[stackPtr];
00225                         --stackPtr;
00226                     }
00227                     else {
00228                         /* both children were intersected, push the farther one */
00229                         if(closestChild1) {
00230                             int tmp = nodeAddr;
00231                             nodeAddr = nodeAddrChild1;
00232                             nodeAddrChild1 = tmp;
00233                         }
00234 
00235                         ++stackPtr;
00236                         traversalStack[stackPtr] = nodeAddrChild1;
00237                     }
00238                 }
00239             }
00240 
00241             /* if node is leaf, fetch triangle list */
00242             if(nodeAddr < 0) {
00243                 float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
00244                 int primAddr = __float_as_int(leaf.x);
00245 
00246 #ifdef __INSTANCING__
00247                 if(primAddr >= 0) {
00248 #endif
00249                     int primAddr2 = __float_as_int(leaf.y);
00250 
00251                     /* pop */
00252                     nodeAddr = traversalStack[stackPtr];
00253                     --stackPtr;
00254 
00255                     /* triangle intersection */
00256                     while(primAddr < primAddr2) {
00257                         /* intersect ray against triangle */
00258                         bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
00259 
00260                         /* shadow ray early termination */
00261                         if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
00262                             return true;
00263 
00264                         primAddr++;
00265                     }
00266 #ifdef __INSTANCING__
00267                 }
00268                 else {
00269                     /* instance push */
00270                     object = kernel_tex_fetch(__prim_object, -primAddr-1);
00271 
00272                     bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
00273 
00274                     ++stackPtr;
00275                     traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
00276 
00277                     nodeAddr = kernel_tex_fetch(__object_node, object);
00278                 }
00279 #endif
00280             }
00281         } while(nodeAddr != ENTRYPOINT_SENTINEL);
00282 
00283 #ifdef __INSTANCING__
00284         if(stackPtr >= 0) {
00285             kernel_assert(object != ~0);
00286 
00287             /* instance pop */
00288             bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
00289             object = ~0;
00290             nodeAddr = traversalStack[stackPtr];
00291             --stackPtr;
00292         }
00293 #endif
00294     } while(nodeAddr != ENTRYPOINT_SENTINEL);
00295 
00296     return (isect->prim != ~0);
00297 }
00298 
00299 __device_inline float3 ray_offset(float3 P, float3 Ng)
00300 {
00301 #ifdef __INTERSECTION_REFINE__
00302     const float epsilon_f = 1e-5f;
00303     const int epsilon_i = 32;
00304 
00305     float3 res;
00306 
00307     /* x component */
00308     if(fabsf(P.x) < epsilon_f) {
00309         res.x = P.x + Ng.x*epsilon_f;
00310     }
00311     else {
00312         uint ix = __float_as_uint(P.x);
00313         ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
00314         res.x = __uint_as_float(ix);
00315     }
00316 
00317     /* y component */
00318     if(fabsf(P.y) < epsilon_f) {
00319         res.y = P.y + Ng.y*epsilon_f;
00320     }
00321     else {
00322         uint iy = __float_as_uint(P.y);
00323         iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
00324         res.y = __uint_as_float(iy);
00325     }
00326 
00327     /* z component */
00328     if(fabsf(P.z) < epsilon_f) {
00329         res.z = P.z + Ng.z*epsilon_f;
00330     }
00331     else {
00332         uint iz = __float_as_uint(P.z);
00333         iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
00334         res.z = __uint_as_float(iz);
00335     }
00336 
00337     return res;
00338 #else
00339     const float epsilon_f = 1e-4f;
00340     return P + epsilon_f*Ng;
00341 #endif
00342 }
00343 
00344 __device_inline float3 bvh_triangle_refine(KernelGlobals *kg, const Intersection *isect, const Ray *ray)
00345 {
00346     float3 P = ray->P;
00347     float3 D = ray->D;
00348     float t = isect->t;
00349 
00350 #ifdef __INTERSECTION_REFINE__
00351     if(isect->object != ~0) {
00352         Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
00353 
00354         P = transform(&tfm, P);
00355         D = transform_direction(&tfm, D*t);
00356         D = normalize_len(D, &t);
00357     }
00358 
00359     P = P + D*t;
00360 
00361     float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
00362     float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
00363     float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
00364     float rt = Oz * invDz;
00365 
00366     P = P + D*rt;
00367 
00368     if(isect->object != ~0) {
00369         Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
00370         P = transform(&tfm, P);
00371     }
00372 
00373     return P;
00374 #else
00375     return P + D*t;
00376 #endif
00377 }
00378 
00379 CCL_NAMESPACE_END
00380