#include "tools.h"
void CpdatrmvLT(UNIT, N, alpha, A, IA, JA, descA, X, IX, JX, descX, incX, beta, 
                Y, IY, JY, descY, incY)
int UNIT;
int N;
double alpha;
double *A;
int IA;
int JA;
int *descA;
double *X;
int IX;
int JX;
int *descX;
int incX;
double beta;
double *Y;
int IY;
int JY;
int *descY;
int  incY;
/*
 *
 *             ======(Yr)======
 *
 *                    N
 *              ---------------
 *       |     |\_             |          A - N x N
 *       |     |  \_           |          Y - N x 1
 *       |     |    \_         |          X - 1 x N
 *      (Xc) N |      \_       |
 *       |     |        \_     |          X will be replicated on every
 *       |     |  (A)     \_   |            process column
 *       |     |            \  |          Y will have space on every
 *       |     |             \_|            process row
 *              ---------------
 */
{
/*
 * .. External routines ..
 */
   char *ptop();
   void pberror_();
   void Cinfog2l();
   int Cnumroc2();
   void Cblacs_gridinfo();
   void Cdgebs2d();
   void Cdgebr2d();
   void Cdgsum2d();
   F_INTG_FCT dgemv_();
   void Cpdscal1();
   void Cpdcopy1();
   void Cpdaxpy1();

   F_CHAR trans;
   char *top;
   int Ir, Jr, Ic, Jc, i, j, kb, nb, lld, LOCp, LOCq, one=1, I=0;
   int ctxt, nprow, npcol, myrow, mycol, arow, acol, currow, curcol;
   int descXc[DLEN_], descYr[DLEN_];
   double *a, *aa, *ac, *absA, *Xc, *Yr, *xc, *yr, *yr2, zero=0.0;

   trans = C2F_CHAR("T");
   ctxt = descA[CTXT_];
   Cblacs_gridinfo(descA[CTXT_], &nprow, &npcol, &myrow, &mycol);
/*
 * Scale Y by beta: Y = beta * Y; this allows us to later add in
 * alpha*A*x to get Y = alpha*A*x + Y*beta
 */
   Cpdscal1(N, beta, Y, IY, JY, descY, incY);
/*
 * Get local information about our matrix
 */
   Cinfog2l(IA, JA, descA, nprow, npcol, myrow, mycol, &i, &j, &arow, &acol);
   ac = &A[ i+j*descA[LLD_] ];
   nb = descA[NB_];
   lld = descA[LLD_];
   LOCp = Cnumroc2(N, IA, nb, myrow, descA[RSRC_], nprow);
   LOCq = Cnumroc2(N, JA, nb, mycol, descA[CSRC_], npcol);
/*
 * Set up Xc, Yr, and absA
 */
   Ir = Jc = 0;
   Ic = IA % nb;
   Jr = JA % nb;
   i = Ic + LOCp;
   j = Jr + LOCq;
   Mmalloc(Yr, double, i+j+nb*LOCp, kb, ctxt);
   Xc = &Yr[j];
   absA = &Xc[i];
   Mdescset(descXc, N+Ic, 1, nb, nb, arow,
            MCindxg2p(JX, descX[NB_], descX[CSRC_], npcol), ctxt, MAX(1,i));
   Mdescset(descYr, 1, N+Jr, nb, nb,
            MCindxg2p(IY, descY[MB_], descY[RSRC_], nprow), acol, ctxt, 1);
   if (myrow == arow) xc = &Xc[Ic];
   else xc = Xc;
   if (mycol == acol) yr2 = yr = &Yr[Jr];
   else yr2 = yr = Yr;
/*
 * Copy X and broadcast it to all process columns
 */
   Cpdcopy1(N, X, IX, JX, descX, incX, Xc, Ic, Jc, descXc, one);
   top = ptop("B", "R", "!");
   if (mycol == descXc[CSRC_])
   {
      for (i=0; i != LOCp; i++) xc[i] = ABS( xc[i] );
      Cdgebs2d(ctxt, "r", top, LOCp, 1, xc, LOCp);
   }
   else Cdgebr2d(ctxt, "r", top, LOCp, 1, xc, LOCp, myrow, descXc[CSRC_]);

   kb = nb - Ic;
   kb = MIN(kb, N);
   currow = arow;
   curcol = acol;
   do
   {
      if (mycol == curcol)
      {
         aa = absA;
         a = ac;
         if (myrow == currow) /* I have diagonal block */
         {
            for (j=0; j != kb; j++)
            {
               for (i=0; i != j; i++) aa[i] = 0.0;
               if (UNIT) aa[i] = 1.0;
               else aa[i] = ABS( a[i] );
               for (i++; i < LOCp; i++) aa[i] = ABS( a[i] );
               a += lld;
               aa += LOCp;
            }
            dgemv_(trans, &LOCp, &kb, &alpha, absA, &LOCp, xc, &one, &zero,
                   yr, &one);
            LOCp -= kb;
            ac += (kb * (lld + 1));
            xc += kb;
         }
         else
         {
            if (LOCp)
            {
               for (j=kb; j; j--)
               {
                  for (i=0; i != LOCp; i++) aa[i] = ABS( a[i] );
                  aa += LOCp;
                  a += lld;
               }
               dgemv_(trans, &LOCp, &kb, &alpha, absA, &LOCp, xc, &one, &zero,
                      yr, &one);
            }
            else for (i=0; i != kb; i++) yr[i] = 0.0;
            ac += (kb * lld);
         }
         yr += kb;
      }
      else if (myrow == currow)
      {
         ac += kb;
         xc += kb;
         LOCp -= kb;
      }
      I += kb;
      kb = MIN(nb, N-I);
      if (++currow == nprow) currow=0;
      if (++curcol == npcol) curcol=0;
   }
   while (I != N);
/*
 * Collect distributed Y
 */
   if (LOCq)
   {
      top = ptop("C", "C", "!");
      Cdgsum2d(ctxt, "col", top, LOCq, 1, yr2, LOCq, descYr[RSRC_], mycol);
   }
/*
 * Set y = A*x + beta*y (Y has beta*y, and Yr has A*x)
 */
   Cpdaxpy1(N, 1.0, Yr, Ir, Jr, descYr, one, Y, IY, JY, descY, incY);
   if (Yr) free(Yr);
}
