Blender V2.61 - r43446
|
00001 00005 /* 00006 * -- SuperLU routine (version 3.0) -- 00007 * Univ. of California Berkeley, Xerox Palo Alto Research Center, 00008 * and Lawrence Berkeley National Lab. 00009 * October 15, 2003 00010 * 00011 */ 00012 /* 00013 Copyright (c) 1994 by Xerox Corporation. All rights reserved. 00014 00015 THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY 00016 EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. 00017 00018 Permission is hereby granted to use or copy this program for any 00019 purpose, provided the above notices are retained on all copies. 00020 Permission to modify the code and to distribute modified code is 00021 granted, provided the above notices are retained, and a notice that 00022 the code was modified is included with the above copyright notice. 00023 */ 00024 00025 #include <stdio.h> 00026 #include <stdlib.h> 00027 #include "ssp_defs.h" 00028 00029 /* 00030 * Function prototypes 00031 */ 00032 void slsolve(int, int, float *, float *); 00033 void smatvec(int, int, int, float *, float *, float *); 00034 extern void scheck_tempv(); 00035 00036 void 00037 spanel_bmod ( 00038 const int m, /* in - number of rows in the matrix */ 00039 const int w, /* in */ 00040 const int jcol, /* in */ 00041 const int nseg, /* in */ 00042 float *dense, /* out, of size n by w */ 00043 float *tempv, /* working array */ 00044 int *segrep, /* in */ 00045 int *repfnz, /* in, of size n by w */ 00046 GlobalLU_t *Glu, /* modified */ 00047 SuperLUStat_t *stat /* output */ 00048 ) 00049 { 00050 /* 00051 * Purpose 00052 * ======= 00053 * 00054 * Performs numeric block updates (sup-panel) in topological order. 00055 * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. 00056 * Special processing on the supernodal portion of L\U[*,j] 00057 * 00058 * Before entering this routine, the original nonzeros in the panel 00059 * were already copied into the spa[m,w]. 00060 * 00061 * Updated/Output parameters- 00062 * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 00063 * collectively in the m-by-w vector dense[*]. 00064 * 00065 */ 00066 00067 #ifdef USE_VENDOR_BLAS 00068 #ifdef _CRAY 00069 _fcd ftcs1 = _cptofcd("L", strlen("L")), 00070 ftcs2 = _cptofcd("N", strlen("N")), 00071 ftcs3 = _cptofcd("U", strlen("U")); 00072 #endif 00073 int incx = 1, incy = 1; 00074 float alpha, beta; 00075 #endif 00076 00077 register int k, ksub; 00078 int fsupc, nsupc, nsupr, nrow; 00079 int krep, krep_ind; 00080 float ukj, ukj1, ukj2; 00081 int luptr, luptr1, luptr2; 00082 int segsze; 00083 int block_nrow; /* no of rows in a block row */ 00084 register int lptr; /* Points to the row subscripts of a supernode */ 00085 int kfnz, irow, no_zeros; 00086 register int isub, isub1, i; 00087 register int jj; /* Index through each column in the panel */ 00088 int *xsup, *supno; 00089 int *lsub, *xlsub; 00090 float *lusup; 00091 int *xlusup; 00092 int *repfnz_col; /* repfnz[] for a column in the panel */ 00093 float *dense_col; /* dense[] for a column in the panel */ 00094 float *tempv1; /* Used in 1-D update */ 00095 float *TriTmp, *MatvecTmp; /* used in 2-D update */ 00096 float zero = 0.0; 00097 register int ldaTmp; 00098 register int r_ind, r_hi; 00099 static int first = 1, maxsuper, rowblk, colblk; 00100 flops_t *ops = stat->ops; 00101 00102 xsup = Glu->xsup; 00103 supno = Glu->supno; 00104 lsub = Glu->lsub; 00105 xlsub = Glu->xlsub; 00106 lusup = Glu->lusup; 00107 xlusup = Glu->xlusup; 00108 00109 if ( first ) { 00110 maxsuper = sp_ienv(3); 00111 rowblk = sp_ienv(4); 00112 colblk = sp_ienv(5); 00113 first = 0; 00114 } 00115 ldaTmp = maxsuper + rowblk; 00116 00117 /* 00118 * For each nonz supernode segment of U[*,j] in topological order 00119 */ 00120 k = nseg - 1; 00121 for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ 00122 00123 /* krep = representative of current k-th supernode 00124 * fsupc = first supernodal column 00125 * nsupc = no of columns in a supernode 00126 * nsupr = no of rows in a supernode 00127 */ 00128 krep = segrep[k--]; 00129 fsupc = xsup[supno[krep]]; 00130 nsupc = krep - fsupc + 1; 00131 nsupr = xlsub[fsupc+1] - xlsub[fsupc]; 00132 nrow = nsupr - nsupc; 00133 lptr = xlsub[fsupc]; 00134 krep_ind = lptr + nsupc - 1; 00135 00136 repfnz_col = repfnz; 00137 dense_col = dense; 00138 00139 if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ 00140 00141 TriTmp = tempv; 00142 00143 /* Sequence through each column in panel -- triangular solves */ 00144 for (jj = jcol; jj < jcol + w; jj++, 00145 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { 00146 00147 kfnz = repfnz_col[krep]; 00148 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ 00149 00150 segsze = krep - kfnz + 1; 00151 luptr = xlusup[fsupc]; 00152 00153 ops[TRSV] += segsze * (segsze - 1); 00154 ops[GEMV] += 2 * nrow * segsze; 00155 00156 /* Case 1: Update U-segment of size 1 -- col-col update */ 00157 if ( segsze == 1 ) { 00158 ukj = dense_col[lsub[krep_ind]]; 00159 luptr += nsupr*(nsupc-1) + nsupc; 00160 00161 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { 00162 irow = lsub[i]; 00163 dense_col[irow] -= ukj * lusup[luptr]; 00164 ++luptr; 00165 } 00166 00167 } else if ( segsze <= 3 ) { 00168 ukj = dense_col[lsub[krep_ind]]; 00169 ukj1 = dense_col[lsub[krep_ind - 1]]; 00170 luptr += nsupr*(nsupc-1) + nsupc-1; 00171 luptr1 = luptr - nsupr; 00172 00173 if ( segsze == 2 ) { 00174 ukj -= ukj1 * lusup[luptr1]; 00175 dense_col[lsub[krep_ind]] = ukj; 00176 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { 00177 irow = lsub[i]; 00178 luptr++; luptr1++; 00179 dense_col[irow] -= (ukj*lusup[luptr] 00180 + ukj1*lusup[luptr1]); 00181 } 00182 } else { 00183 ukj2 = dense_col[lsub[krep_ind - 2]]; 00184 luptr2 = luptr1 - nsupr; 00185 ukj1 -= ukj2 * lusup[luptr2-1]; 00186 ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; 00187 dense_col[lsub[krep_ind]] = ukj; 00188 dense_col[lsub[krep_ind-1]] = ukj1; 00189 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { 00190 irow = lsub[i]; 00191 luptr++; luptr1++; luptr2++; 00192 dense_col[irow] -= ( ukj*lusup[luptr] 00193 + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); 00194 } 00195 } 00196 00197 } else { /* segsze >= 4 */ 00198 00199 /* Copy U[*,j] segment from dense[*] to TriTmp[*], which 00200 holds the result of triangular solves. */ 00201 no_zeros = kfnz - fsupc; 00202 isub = lptr + no_zeros; 00203 for (i = 0; i < segsze; ++i) { 00204 irow = lsub[isub]; 00205 TriTmp[i] = dense_col[irow]; /* Gather */ 00206 ++isub; 00207 } 00208 00209 /* start effective triangle */ 00210 luptr += nsupr * no_zeros + no_zeros; 00211 00212 #ifdef USE_VENDOR_BLAS 00213 #ifdef _CRAY 00214 STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 00215 &nsupr, TriTmp, &incx ); 00216 #else 00217 strsv_( "L", "N", "U", &segsze, &lusup[luptr], 00218 &nsupr, TriTmp, &incx ); 00219 #endif 00220 #else 00221 slsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); 00222 #endif 00223 00224 00225 } /* else ... */ 00226 00227 } /* for jj ... end tri-solves */ 00228 00229 /* Block row updates; push all the way into dense[*] block */ 00230 for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { 00231 00232 r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); 00233 block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); 00234 luptr = xlusup[fsupc] + nsupc + r_ind; 00235 isub1 = lptr + nsupc + r_ind; 00236 00237 repfnz_col = repfnz; 00238 TriTmp = tempv; 00239 dense_col = dense; 00240 00241 /* Sequence through each column in panel -- matrix-vector */ 00242 for (jj = jcol; jj < jcol + w; jj++, 00243 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { 00244 00245 kfnz = repfnz_col[krep]; 00246 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ 00247 00248 segsze = krep - kfnz + 1; 00249 if ( segsze <= 3 ) continue; /* skip unrolled cases */ 00250 00251 /* Perform a block update, and scatter the result of 00252 matrix-vector to dense[]. */ 00253 no_zeros = kfnz - fsupc; 00254 luptr1 = luptr + nsupr * no_zeros; 00255 MatvecTmp = &TriTmp[maxsuper]; 00256 00257 #ifdef USE_VENDOR_BLAS 00258 alpha = one; 00259 beta = zero; 00260 #ifdef _CRAY 00261 SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 00262 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); 00263 #else 00264 sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 00265 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); 00266 #endif 00267 #else 00268 smatvec(nsupr, block_nrow, segsze, &lusup[luptr1], 00269 TriTmp, MatvecTmp); 00270 #endif 00271 00272 /* Scatter MatvecTmp[*] into SPA dense[*] temporarily 00273 * such that MatvecTmp[*] can be re-used for the 00274 * the next blok row update. dense[] will be copied into 00275 * global store after the whole panel has been finished. 00276 */ 00277 isub = isub1; 00278 for (i = 0; i < block_nrow; i++) { 00279 irow = lsub[isub]; 00280 dense_col[irow] -= MatvecTmp[i]; 00281 MatvecTmp[i] = zero; 00282 ++isub; 00283 } 00284 00285 } /* for jj ... */ 00286 00287 } /* for each block row ... */ 00288 00289 /* Scatter the triangular solves into SPA dense[*] */ 00290 repfnz_col = repfnz; 00291 TriTmp = tempv; 00292 dense_col = dense; 00293 00294 for (jj = jcol; jj < jcol + w; jj++, 00295 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { 00296 kfnz = repfnz_col[krep]; 00297 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ 00298 00299 segsze = krep - kfnz + 1; 00300 if ( segsze <= 3 ) continue; /* skip unrolled cases */ 00301 00302 no_zeros = kfnz - fsupc; 00303 isub = lptr + no_zeros; 00304 for (i = 0; i < segsze; i++) { 00305 irow = lsub[isub]; 00306 dense_col[irow] = TriTmp[i]; 00307 TriTmp[i] = zero; 00308 ++isub; 00309 } 00310 00311 } /* for jj ... */ 00312 00313 } else { /* 1-D block modification */ 00314 00315 00316 /* Sequence through each column in the panel */ 00317 for (jj = jcol; jj < jcol + w; jj++, 00318 repfnz_col += m, dense_col += m) { 00319 00320 kfnz = repfnz_col[krep]; 00321 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ 00322 00323 segsze = krep - kfnz + 1; 00324 luptr = xlusup[fsupc]; 00325 00326 ops[TRSV] += segsze * (segsze - 1); 00327 ops[GEMV] += 2 * nrow * segsze; 00328 00329 /* Case 1: Update U-segment of size 1 -- col-col update */ 00330 if ( segsze == 1 ) { 00331 ukj = dense_col[lsub[krep_ind]]; 00332 luptr += nsupr*(nsupc-1) + nsupc; 00333 00334 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { 00335 irow = lsub[i]; 00336 dense_col[irow] -= ukj * lusup[luptr]; 00337 ++luptr; 00338 } 00339 00340 } else if ( segsze <= 3 ) { 00341 ukj = dense_col[lsub[krep_ind]]; 00342 luptr += nsupr*(nsupc-1) + nsupc-1; 00343 ukj1 = dense_col[lsub[krep_ind - 1]]; 00344 luptr1 = luptr - nsupr; 00345 00346 if ( segsze == 2 ) { 00347 ukj -= ukj1 * lusup[luptr1]; 00348 dense_col[lsub[krep_ind]] = ukj; 00349 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { 00350 irow = lsub[i]; 00351 ++luptr; ++luptr1; 00352 dense_col[irow] -= (ukj*lusup[luptr] 00353 + ukj1*lusup[luptr1]); 00354 } 00355 } else { 00356 ukj2 = dense_col[lsub[krep_ind - 2]]; 00357 luptr2 = luptr1 - nsupr; 00358 ukj1 -= ukj2 * lusup[luptr2-1]; 00359 ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; 00360 dense_col[lsub[krep_ind]] = ukj; 00361 dense_col[lsub[krep_ind-1]] = ukj1; 00362 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { 00363 irow = lsub[i]; 00364 ++luptr; ++luptr1; ++luptr2; 00365 dense_col[irow] -= ( ukj*lusup[luptr] 00366 + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); 00367 } 00368 } 00369 00370 } else { /* segsze >= 4 */ 00371 /* 00372 * Perform a triangular solve and block update, 00373 * then scatter the result of sup-col update to dense[]. 00374 */ 00375 no_zeros = kfnz - fsupc; 00376 00377 /* Copy U[*,j] segment from dense[*] to tempv[*]: 00378 * The result of triangular solve is in tempv[*]; 00379 * The result of matrix vector update is in dense_col[*] 00380 */ 00381 isub = lptr + no_zeros; 00382 for (i = 0; i < segsze; ++i) { 00383 irow = lsub[isub]; 00384 tempv[i] = dense_col[irow]; /* Gather */ 00385 ++isub; 00386 } 00387 00388 /* start effective triangle */ 00389 luptr += nsupr * no_zeros + no_zeros; 00390 00391 #ifdef USE_VENDOR_BLAS 00392 #ifdef _CRAY 00393 STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 00394 &nsupr, tempv, &incx ); 00395 #else 00396 strsv_( "L", "N", "U", &segsze, &lusup[luptr], 00397 &nsupr, tempv, &incx ); 00398 #endif 00399 00400 luptr += segsze; /* Dense matrix-vector */ 00401 tempv1 = &tempv[segsze]; 00402 alpha = one; 00403 beta = zero; 00404 #ifdef _CRAY 00405 SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 00406 &nsupr, tempv, &incx, &beta, tempv1, &incy ); 00407 #else 00408 sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 00409 &nsupr, tempv, &incx, &beta, tempv1, &incy ); 00410 #endif 00411 #else 00412 slsolve ( nsupr, segsze, &lusup[luptr], tempv ); 00413 00414 luptr += segsze; /* Dense matrix-vector */ 00415 tempv1 = &tempv[segsze]; 00416 smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); 00417 #endif 00418 00419 /* Scatter tempv[*] into SPA dense[*] temporarily, such 00420 * that tempv[*] can be used for the triangular solve of 00421 * the next column of the panel. They will be copied into 00422 * ucol[*] after the whole panel has been finished. 00423 */ 00424 isub = lptr + no_zeros; 00425 for (i = 0; i < segsze; i++) { 00426 irow = lsub[isub]; 00427 dense_col[irow] = tempv[i]; 00428 tempv[i] = zero; 00429 isub++; 00430 } 00431 00432 /* Scatter the update from tempv1[*] into SPA dense[*] */ 00433 /* Start dense rectangular L */ 00434 for (i = 0; i < nrow; i++) { 00435 irow = lsub[isub]; 00436 dense_col[irow] -= tempv1[i]; 00437 tempv1[i] = zero; 00438 ++isub; 00439 } 00440 00441 } /* else segsze>=4 ... */ 00442 00443 } /* for each column in the panel... */ 00444 00445 } /* else 1-D update ... */ 00446 00447 } /* for each updating supernode ... */ 00448 00449 } 00450 00451 00452