Blender: ssp_blas2.c Source File

Blender V2.61 - r43446
00001 
00005 /*
00006  * -- SuperLU routine (version 3.0) --
00007  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
00008  * and Lawrence Berkeley National Lab.
00009  * October 15, 2003
00010  *
00011  */
00012 /*
00013  * File name:       ssp_blas2.c
00014  * Purpose:     Sparse BLAS 2, using some dense BLAS 2 operations.
00015  */
00016 
00017 #include "ssp_defs.h"
00018 
00019 /* 
00020  * Function prototypes 
00021  */
00022 void susolve(int, int, float*, float*);
00023 void slsolve(int, int, float*, float*);
00024 void smatvec(int, int, int, float*, float*, float*);
00025 int strsv_(char*, char*, char*, int*, float*, int*, float*, int*);
00026 
00027 int
00028 sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
00029          SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
00030 {
00031 /*
00032  *   Purpose
00033  *   =======
00034  *
00035  *   sp_strsv() solves one of the systems of equations   
00036  *       A*x = b,   or   A'*x = b,
00037  *   where b and x are n element vectors and A is a sparse unit , or   
00038  *   non-unit, upper or lower triangular matrix.   
00039  *   No test for singularity or near-singularity is included in this   
00040  *   routine. Such tests must be performed before calling this routine.   
00041  *
00042  *   Parameters   
00043  *   ==========   
00044  *
00045  *   uplo   - (input) char*
00046  *            On entry, uplo specifies whether the matrix is an upper or   
00047  *             lower triangular matrix as follows:   
00048  *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
00049  *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
00050  *
00051  *   trans  - (input) char*
00052  *             On entry, trans specifies the equations to be solved as   
00053  *             follows:   
00054  *                trans = 'N' or 'n'   A*x = b.   
00055  *                trans = 'T' or 't'   A'*x = b.
00056  *                trans = 'C' or 'c'   A'*x = b.   
00057  *
00058  *   diag   - (input) char*
00059  *             On entry, diag specifies whether or not A is unit   
00060  *             triangular as follows:   
00061  *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
00062  *                diag = 'N' or 'n'   A is not assumed to be unit   
00063  *                                    triangular.   
00064  *       
00065  *   L       - (input) SuperMatrix*
00066  *         The factor L from the factorization Pr*A*Pc=L*U. Use
00067  *             compressed row subscripts storage for supernodes,
00068  *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
00069  *
00070  *   U       - (input) SuperMatrix*
00071  *          The factor U from the factorization Pr*A*Pc=L*U.
00072  *          U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
00073  *    
00074  *   x       - (input/output) float*
00075  *             Before entry, the incremented array X must contain the n   
00076  *             element right-hand side vector b. On exit, X is overwritten 
00077  *             with the solution vector x.
00078  *
00079  *   info    - (output) int*
00080  *             If *info = -i, the i-th argument had an illegal value.
00081  *
00082  */
00083 #ifdef _CRAY
00084     _fcd ftcs1 = _cptofcd("L", strlen("L")),
00085      ftcs2 = _cptofcd("N", strlen("N")),
00086      ftcs3 = _cptofcd("U", strlen("U"));
00087 #endif
00088     SCformat *Lstore;
00089     NCformat *Ustore;
00090     float   *Lval, *Uval;
00091     int incx = 1;
00092     int nrow;
00093     int fsupc, nsupr, nsupc, luptr, istart, irow;
00094     int i, k, iptr, jcol;
00095     float *work;
00096     flops_t solve_ops;
00097 
00098     /* Test the input parameters */
00099     *info = 0;
00100     if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
00101     else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
00102               !lsame_(trans, "C")) *info = -2;
00103     else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
00104     else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
00105     else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
00106     if ( *info ) {
00107     i = -(*info);
00108     xerbla_("sp_strsv", &i);
00109     return 0;
00110     }
00111 
00112     Lstore = L->Store;
00113     Lval = Lstore->nzval;
00114     Ustore = U->Store;
00115     Uval = Ustore->nzval;
00116     solve_ops = 0;
00117 
00118     if ( !(work = floatCalloc(L->nrow)) )
00119     ABORT("Malloc fails for work in sp_strsv().");
00120     
00121     if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */
00122     
00123     if ( lsame_(uplo, "L") ) {
00124         /* Form x := inv(L)*x */
00125             if ( L->nrow == 0 ) {
00126             SUPERLU_FREE(work);
00127             return 0; /* Quick return */
00128         }
00129         
00130         for (k = 0; k <= Lstore->nsuper; k++) {
00131         fsupc = L_FST_SUPC(k);
00132         istart = L_SUB_START(fsupc);
00133         nsupr = L_SUB_START(fsupc+1) - istart;
00134         nsupc = L_FST_SUPC(k+1) - fsupc;
00135         luptr = L_NZ_START(fsupc);
00136         nrow = nsupr - nsupc;
00137 
00138             solve_ops += nsupc * (nsupc - 1);
00139             solve_ops += 2 * nrow * nsupc;
00140 
00141         if ( nsupc == 1 ) {
00142             for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
00143             irow = L_SUB(iptr);
00144             ++luptr;
00145             x[irow] -= x[fsupc] * Lval[luptr];
00146             }
00147         } else {
00148 #ifdef USE_VENDOR_BLAS
00149 #ifdef _CRAY
00150             STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
00151                 &x[fsupc], &incx);
00152         
00153             SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
00154                 &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
00155 #else
00156             strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
00157                 &x[fsupc], &incx);
00158         
00159             sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
00160                 &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
00161 #endif
00162 #else
00163             slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
00164         
00165             smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
00166                              &x[fsupc], &work[0] );
00167 #endif      
00168         
00169             iptr = istart + nsupc;
00170             for (i = 0; i < nrow; ++i, ++iptr) {
00171             irow = L_SUB(iptr);
00172             x[irow] -= work[i]; /* Scatter */
00173             work[i] = 0.0;
00174 
00175             }
00176         }
00177         } /* for k ... */
00178         
00179     } else {
00180         /* Form x := inv(U)*x */
00181         
00182         if ( U->nrow == 0 ) return 0; /* Quick return */
00183         
00184         for (k = Lstore->nsuper; k >= 0; k--) {
00185             fsupc = L_FST_SUPC(k);
00186             nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
00187             nsupc = L_FST_SUPC(k+1) - fsupc;
00188             luptr = L_NZ_START(fsupc);
00189         
00190                 solve_ops += nsupc * (nsupc + 1);
00191 
00192         if ( nsupc == 1 ) {
00193             x[fsupc] /= Lval[luptr];
00194             for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
00195             irow = U_SUB(i);
00196             x[irow] -= x[fsupc] * Uval[i];
00197             }
00198         } else {
00199 #ifdef USE_VENDOR_BLAS
00200 #ifdef _CRAY
00201             STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
00202                &x[fsupc], &incx);
00203 #else
00204             strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
00205                            &x[fsupc], &incx);
00206 #endif
00207 #else       
00208             susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
00209 #endif      
00210 
00211             for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
00212                 solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
00213                 for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
00214                 i++) {
00215                 irow = U_SUB(i);
00216                 x[irow] -= x[jcol] * Uval[i];
00217                 }
00218                     }
00219         }
00220         } /* for k ... */
00221         
00222     }
00223     } else { /* Form x := inv(A')*x */
00224     
00225     if ( lsame_(uplo, "L") ) {
00226         /* Form x := inv(L')*x */
00227             if ( L->nrow == 0 ) return 0; /* Quick return */
00228         
00229         for (k = Lstore->nsuper; k >= 0; --k) {
00230             fsupc = L_FST_SUPC(k);
00231             istart = L_SUB_START(fsupc);
00232             nsupr = L_SUB_START(fsupc+1) - istart;
00233             nsupc = L_FST_SUPC(k+1) - fsupc;
00234             luptr = L_NZ_START(fsupc);
00235 
00236         solve_ops += 2 * (nsupr - nsupc) * nsupc;
00237 
00238         for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
00239             iptr = istart + nsupc;
00240             for (i = L_NZ_START(jcol) + nsupc; 
00241                 i < L_NZ_START(jcol+1); i++) {
00242             irow = L_SUB(iptr);
00243             x[jcol] -= x[irow] * Lval[i];
00244             iptr++;
00245             }
00246         }
00247         
00248         if ( nsupc > 1 ) {
00249             solve_ops += nsupc * (nsupc - 1);
00250 #ifdef _CRAY
00251                     ftcs1 = _cptofcd("L", strlen("L"));
00252                     ftcs2 = _cptofcd("T", strlen("T"));
00253                     ftcs3 = _cptofcd("U", strlen("U"));
00254             STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
00255             &x[fsupc], &incx);
00256 #else
00257             strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
00258             &x[fsupc], &incx);
00259 #endif
00260         }
00261         }
00262     } else {
00263         /* Form x := inv(U')*x */
00264         if ( U->nrow == 0 ) return 0; /* Quick return */
00265         
00266         for (k = 0; k <= Lstore->nsuper; k++) {
00267             fsupc = L_FST_SUPC(k);
00268             nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
00269             nsupc = L_FST_SUPC(k+1) - fsupc;
00270             luptr = L_NZ_START(fsupc);
00271 
00272         for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
00273             solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
00274             for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
00275             irow = U_SUB(i);
00276             x[jcol] -= x[irow] * Uval[i];
00277             }
00278         }
00279 
00280         solve_ops += nsupc * (nsupc + 1);
00281 
00282         if ( nsupc == 1 ) {
00283             x[fsupc] /= Lval[luptr];
00284         } else {
00285 #ifdef _CRAY
00286                     ftcs1 = _cptofcd("U", strlen("U"));
00287                     ftcs2 = _cptofcd("T", strlen("T"));
00288                     ftcs3 = _cptofcd("N", strlen("N"));
00289             STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
00290                 &x[fsupc], &incx);
00291 #else
00292             strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
00293                 &x[fsupc], &incx);
00294 #endif
00295         }
00296         } /* for k ... */
00297     }
00298     }
00299 
00300     stat->ops[SOLVE] += solve_ops;
00301     SUPERLU_FREE(work);
00302     return 0;
00303 }
00304 
00305 
00306 
00307 
00308 int
00309 sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x, 
00310      int incx, float beta, float *y, int incy)
00311 {
00312 /*  Purpose   
00313     =======   
00314 
00315     sp_sgemv()  performs one of the matrix-vector operations   
00316        y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
00317     where alpha and beta are scalars, x and y are vectors and A is a
00318     sparse A->nrow by A->ncol matrix.   
00319 
00320     Parameters   
00321     ==========   
00322 
00323     TRANS  - (input) char*
00324              On entry, TRANS specifies the operation to be performed as   
00325              follows:   
00326                 TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
00327                 TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
00328                 TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
00329 
00330     ALPHA  - (input) float
00331              On entry, ALPHA specifies the scalar alpha.   
00332 
00333     A      - (input) SuperMatrix*
00334              Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
00335              Currently, the type of A can be:
00336                  Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
00337              In the future, more general A can be handled.
00338 
00339     X      - (input) float*, array of DIMENSION at least   
00340              ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
00341              and at least   
00342              ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
00343              Before entry, the incremented array X must contain the   
00344              vector x.   
00345 
00346     INCX   - (input) int
00347              On entry, INCX specifies the increment for the elements of   
00348              X. INCX must not be zero.   
00349 
00350     BETA   - (input) float
00351              On entry, BETA specifies the scalar beta. When BETA is   
00352              supplied as zero then Y need not be set on input.   
00353 
00354     Y      - (output) float*,  array of DIMENSION at least   
00355              ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
00356              and at least   
00357              ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
00358              Before entry with BETA non-zero, the incremented array Y   
00359              must contain the vector y. On exit, Y is overwritten by the 
00360              updated vector y.
00361          
00362     INCY   - (input) int
00363              On entry, INCY specifies the increment for the elements of   
00364              Y. INCY must not be zero.   
00365 
00366     ==== Sparse Level 2 Blas routine.   
00367 */
00368 
00369     /* Local variables */
00370     NCformat *Astore;
00371     float   *Aval;
00372     int info;
00373     float temp;
00374     int lenx, leny, i, j, irow;
00375     int iy, jx, jy, kx, ky;
00376     int notran;
00377 
00378     notran = lsame_(trans, "N");
00379     Astore = A->Store;
00380     Aval = Astore->nzval;
00381     
00382     /* Test the input parameters */
00383     info = 0;
00384     if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
00385     else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
00386     else if (incx == 0) info = 5;
00387     else if (incy == 0) info = 8;
00388     if (info != 0) {
00389     xerbla_("sp_sgemv ", &info);
00390     return 0;
00391     }
00392 
00393     /* Quick return if possible. */
00394     if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
00395     return 0;
00396 
00397     /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
00398        up the start points in  X  and  Y. */
00399     if (lsame_(trans, "N")) {
00400     lenx = A->ncol;
00401     leny = A->nrow;
00402     } else {
00403     lenx = A->nrow;
00404     leny = A->ncol;
00405     }
00406     if (incx > 0) kx = 0;
00407     else kx =  - (lenx - 1) * incx;
00408     if (incy > 0) ky = 0;
00409     else ky =  - (leny - 1) * incy;
00410 
00411     /* Start the operations. In this version the elements of A are   
00412        accessed sequentially with one pass through A. */
00413     /* First form  y := beta*y. */
00414     if (beta != 1.) {
00415     if (incy == 1) {
00416         if (beta == 0.)
00417         for (i = 0; i < leny; ++i) y[i] = 0.;
00418         else
00419         for (i = 0; i < leny; ++i) y[i] = beta * y[i];
00420     } else {
00421         iy = ky;
00422         if (beta == 0.)
00423         for (i = 0; i < leny; ++i) {
00424             y[iy] = 0.;
00425             iy += incy;
00426         }
00427         else
00428         for (i = 0; i < leny; ++i) {
00429             y[iy] = beta * y[iy];
00430             iy += incy;
00431         }
00432     }
00433     }
00434     
00435     if (alpha == 0.) return 0;
00436 
00437     if ( notran ) {
00438     /* Form  y := alpha*A*x + y. */
00439     jx = kx;
00440     if (incy == 1) {
00441         for (j = 0; j < A->ncol; ++j) {
00442         if (x[jx] != 0.) {
00443             temp = alpha * x[jx];
00444             for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
00445             irow = Astore->rowind[i];
00446             y[irow] += temp * Aval[i];
00447             }
00448         }
00449         jx += incx;
00450         }
00451     } else {
00452         ABORT("Not implemented.");
00453     }
00454     } else {
00455     /* Form  y := alpha*A'*x + y. */
00456     jy = ky;
00457     if (incx == 1) {
00458         for (j = 0; j < A->ncol; ++j) {
00459         temp = 0.;
00460         for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
00461             irow = Astore->rowind[i];
00462             temp += Aval[i] * x[irow];
00463         }
00464         y[jy] += alpha * temp;
00465         jy += incy;
00466         }
00467     } else {
00468         ABORT("Not implemented.");
00469     }
00470     }
00471     return 0;
00472 } /* sp_sgemv */
00473 
00474 
00475