/*
 *             Automatically Tuned Linear Algebra Software v3.0
 *                    (C) Copyright 1997 R. Clint Whaley                     
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#include "atlas_misc.h"
#include "atlas_lvl3.h"
#include "atlas_cacheedge.h"

/*
 * Fallback used only when L1CacheSize has not been defined before this point.
 * mm_tMK passes the value through ATL_DivBySize() and uses the result as the
 * cache budget when choosing how many NB-wide column blocks to process at a
 * time (presumably a byte count -- confirm against the ATLAS headers).
 */
#ifndef L1CacheSize
   #define L1CacheSize 8192
#endif

int Mjoin(PATL,mm_tMK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, 
                       const int M, const int N, const int K, 
                       const SCALAR alpha, const TYPE *A, const int lda,
                       const TYPE *B, const int ldb, const SCALAR beta, 
                       TYPE *C, const int ldc)
/*
 * Special code for when M & K <= NB; no reuse of panels, so need to promote
 * internal reuse within either L1 or CacheEdge (typically L2).
 * If C & B are same matrix, we can use aX code to handle a=0.0, since we
 * know the matrix is initialized.
 *
 * A is used in place when K == lda, TA == AtlasTrans and alpha is one;
 * otherwise it is copied into an M*K workspace.  alpha is handed to that
 * copy routine, and every multiply kernel is then called with ATL_rone.
 * B is used in place when ldb == K and TB == AtlasNoTrans; otherwise panels
 * of Np columns are copied into a workspace before being multiplied.
 * N is walked in groups of Npb blocks of NB columns, followed by one call
 * for the N % NB leftover columns.
 *
 * RETURNS: 0 on success, -1 if a workspace could not be allocated (any
 *          workspace already obtained is released before returning).
 */
{
   const int nNb = ATL_DivByNB(N);
   const int nr = N - ATL_MulByNB(nNb);   /* columns left after full blocks */
   const int incpB=ATL_MulByNB(K), incC = ATL_MulByNB(ldc);
   int incB, Np0, Np, Npb, CS, J, j, nBcols;
   void *vA=NULL, *vB=NULL;
   TYPE *b, *pB, *pA;
   MATSCAL gescal;
   NBMM0 NBmm0;
   MAT2BLK copyB;

   ATL_assert(M <= NB && K <= NB);
/*
 * Choose Npb, the number of NB-wide column blocks handled per outer
 * iteration, from the cache budget CS
 */
   CS = ATL_DivBySize(L1CacheSize);
   if (ldc == ldb && C == B && M == K && TB == AtlasNoTrans)
      Npb = ATL_DivByNB((CS - M*K) / (2*K));
   else Npb = ATL_DivByNB((CS - M*K) / (2*K+M));
/*
 * <= rather than ==: CS - M*K is negative when the A block alone exceeds
 * the budget, and a negative Npb must not reach the allocation below
 */
   if (Npb <= 0)  /* if it won't fit in L1, try in CacheEdge */
   {
      #ifndef CacheEdge
         Npb = ATL_DivByNB(1000);
      #elif CacheEdge == 0
         Npb = ATL_DivByNB(1000);
      #else
         CS = ATL_DivBySize(CacheEdge);
         if (ldc == ldb && C == B && M == K && TB == AtlasNoTrans)
            Npb = ATL_DivByNB((CS - M*K) / (2*K));
         else Npb = ATL_DivByNB((CS - M*K) / (2*K+M));
      #endif
/*
 *    Always make progress: the outer loop below steps J by Npb
 */
      if (Npb < 1) Npb = 1;
   }
   if (Npb > nNb) Npb = nNb;
   Np = ATL_MulByNB(Npb);

/*
 * Get A into the layout the kernels are called with (leading dimension K)
 */
   if (K == lda && TA == AtlasTrans && alpha == ATL_rone) pA = (TYPE *) A;
   else
   {
      vA = malloc(ATL_MulBySize(M*K) + ATL_Cachelen);
      if (!vA) return(-1);
      pA = ATL_AlignPtr(vA);
      if (TA == AtlasNoTrans) 
      {
         if (alpha == ATL_rone) 
            Mjoin(PATL,row2blkT_a1)(K, M, A, lda, pA, alpha);
         else Mjoin(PATL,row2blkT_aX)(K, M, A, lda, pA, alpha);
      }
      else
      {
         if (alpha == ATL_rone) Mjoin(PATL,col2blk_a1)(K, M, A, lda, pA, alpha);
         else Mjoin(PATL,col2blk_aX)(K, M, A, lda, pA, alpha);
      }
   }

/*
 * Use B in place if possible, otherwise set up a copy routine & workspace
 */
   if (ldb == K && TB == AtlasNoTrans)
   {
      pB = (TYPE *) B;
      copyB = NULL;
      incB = Np * ldb;
   }
   else
   {
/*
 *    When N < NB there are no full blocks (Np == 0), but the leftover
 *    columns are still copied into this workspace, so size it for them
 */
      nBcols = Np ? Np : nr;
      vB = malloc(ATL_MulBySize(K*nBcols) + ATL_Cachelen);
      while (!vB && Np)  /* on failure, retry with one block fewer */
      {
         Npb--; Np -= NB;
         if (Np) vB = malloc(ATL_MulBySize(K*Np) + ATL_Cachelen);
      }
      if (!vB)
      {
         free(vA);  /* don't leak A's workspace on the error path */
         return(-1);
      }
      pB = ATL_AlignPtr(vB);
      if (TB == AtlasNoTrans)
      {
         copyB = Mjoin(PATL,col2blk_a1);
         incB = Np * ldb;
      }
      else
      {
         copyB = Mjoin(PATL,row2blkT_a1);
         incB = Np;
      }
   }

/*
 * Select the kernel used for the full NB-wide column blocks
 */
   if (K == NB)
   {
      gescal = NULL;
      if (M == NB)
      {
         if (beta == ATL_rzero) NBmm0 = NBmm_b0;
         else if (beta == ATL_rone) NBmm0 = NBmm_b1;
         else NBmm0 = NBmm_bX;
      }
      else
      {
         if (beta == ATL_rzero) NBmm0 = Mjoin(PATL,pMBmm_b0);
         else if (beta == ATL_rone) NBmm0 = Mjoin(PATL,pMBmm_b1);
         else NBmm0 = Mjoin(PATL,pMBmm_bX);
      }
   }
   else
   {
      NBmm0 = Mjoin(PATL,pKBmm);
/*
 *    With beta == 0, C is explicitly scaled first unless C is B itself
 *    (see the note on initialized matrices in the header comment)
 */
      if ( beta == ATL_rzero && 
           ((B != C) || (ldc != ldb) || (TB != AtlasNoTrans)) )
         gescal = Mjoin(PATL,gescal_b0);
      else gescal = NULL;
   }
   Np0 = Np;  /* incB was computed from this panel width */

   for (J=nNb; J; J -= Npb)
   {
      if (Npb > J)  /* final, narrower panel */
      {
         Npb = J;
         Np = ATL_MulByNB(Npb);
      }
      if (copyB)
      {
         copyB(K, Np, B, ldb, pB, ATL_rone);
         B += incB;
      }
      if (gescal) gescal(M, Np, beta, C, ldc);
      for (b=pB, j=Npb; j; j--, b += incpB, C += incC)
         NBmm0(M, NB, K, ATL_rone, pA, K, b, K, beta, C, ldc);
      if (copyB == NULL) pB += incB;
   }
/*
 * Handle the N % NB columns that do not make up a full block
 */
   if (nr)
   {
/*
 *    The last advance above used the full-panel stride even if the final
 *    panel was narrower; back up by the difference
 */
      if (copyB)
      {
         if (TB == AtlasNoTrans) B -= (Np0 - Np)*ldb;
         else B -= Np0 - Np;
         copyB(K, nr, B, ldb, pB, ATL_rone);
      }
      else pB -= (Np0 - Np)*K;
      if (M == MB && K == NB)
      {
/*
 *       Kernel suffix must match beta: _b0 for zero, _b1 for one, _bX else
 */
         if (beta == ATL_rzero)
            Mjoin(PATL,pNBmm_b0)(M, nr, K, ATL_rone, pA, K, pB, K, beta, C, ldc);
         else if (beta == ATL_rone)
            Mjoin(PATL,pNBmm_b1)(M, nr, K, ATL_rone, pA, K, pB, K, beta, C, ldc);
         else
            Mjoin(PATL,pNBmm_bX)(M, nr, K, ATL_rone, pA, K, pB, K, beta, C, ldc);
      }
      else
      {
         if (gescal) Mjoin(PATL,gezero)(M, nr, C, ldc);
         Mjoin(PATL,pKBmm)(M, nr, K, ATL_rone, pA, K, pB, K, beta, C, ldc);
      }
   }
   free(vA);
   free(vB);
   return(0);
}
