/*
 *             Automatically Tuned Linear Algebra Software v3.0Beta
 *                    (C) Copyright 1997 R. Clint Whaley                     
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include "atlas_misc.h"

/* Smaller of x and y.  Function-like macro: an argument may be evaluated twice */
#define Mmin(x, y) ( (x) > (y) ? (y) : (x) )
/* Prototype only; no definition or call is visible in this part of the file */
double time00(void);

/* Ratio passed to GetAvg() to decide whether two timings agree */
#define TOLERANCE 1.2
/* NOTE(review): REPS is not referenced in the visible part of this file */
#define REPS 4096
/*
 * NOTE(review): L1FNAME is not referenced in the visible part of this file;
 * findNBs() spells out "res/L1CacheSize" directly
 */
#define L1FNAME "L1CacheSize"
/* Number of timings read from each result file (GetAvg() assumes 3) */
#define NTIM 3
/* Upper limit used when searching latency factors (GetGoodLat(), mmsearch()) */
#define MAXLAT 6

/* Only referenced from disabled (#if 0) code in the visible part of this file */
char LANG;
/*
 * Search results kept at file scope.  When ComplexFind is nonzero, mmsearch()
 * stores its best case in the Z* variables and returns early; otherwise it
 * stores the best case in the R* variables.
 */
int ComplexFind=0, Zmuladd=0, Znb=0, Zmu=0, Znu=0, Zku=0, Zlat=0;
int ZFFetch=0, Zifetch=(-1), Znfetch=1;
int RmuladdB=0, RnbB=0, RmuB=0, RnuB=0, RkuB=0, RlatB=0;
int RFFetch=0, Rifetch=(-1), Rnfetch=1;
/* Best MFLOP rates; only RmfB is assigned in the visible part of this file */
double RmfB=0.0, ZmfB=0.0;

/*
 * Write the command-line help text for program xnam to stderr and then
 * terminate the process with exit(-1).  This routine does not return.
 */
void PrintUsage(char *xnam)
{
   static const char *const helpTxt[] =
   {
      "-h         : Print this help screen\n",
      "-f         : Force complete search over given parameters\n",
      "-p s/d/c/z : set the precision to search for\n",
      "-r #       : Set max number of registers to use to # (default 32)\n",
      "-m #       : Set max L1 cache size (kilobytes) to #\n",
      "-L <c/f>   : Select what language to use (C or Fortran77)\n",
      "-K #       : Set K-loop unrolling to # (-1 = K).\n",
      "-l #       : Use latency factor #.  If set to 0,\n",
"             do not do latency checking.  By default, latency checking is\n",
"             done only if initial timings show it is a win.\n"
   };
   const int nlines = (int) (sizeof(helpTxt) / sizeof(helpTxt[0]));
   int line;

/*
 * Only the first line needs formatting; the rest are fixed text
 */
   fprintf(stderr, "\n\nUsage: %s [-r #][-h][-f][-l #][-p s/d/c/z][-m #]\n",
           xnam);
   for (line=0; line < nlines; line++) fputs(helpTxt[line], stderr);
   exit(-1);
}

/*
 * Parse the command line into the search settings.
 *
 * nargs, args : argument count and vector as received by main
 * pre         : OUT precision character (default 'd')
 * lang        : OUT 'C' (default) or 'F' when -L is given an f/F argument
 * ku          : OUT K-loop unrolling from -K (default 0)
 * LAT         : OUT latency factor from -l (default -1)
 * FRC         : OUT value given to -f (default 0)
 * nreg        : OUT register count from -r (default -1)
 * MaxL1Size   : OUT maximum L1 size from -m (default 64)
 * ROUT        : OUT value given to -R (default 0)
 *
 * Every flag except -h consumes the following argument.  An argument that
 * does not start with '-', an unrecognized flag, or a flag with no value
 * following it results in PrintUsage(), which exits the program.
 */
void GetSettings(int nargs, char *args[], char *pre, char *lang, int *ku, 
                 int *LAT, int *FRC, int *nreg, int *MaxL1Size, int *ROUT)
{
   int i;
   char flag;

   *FRC = 0;
   *LAT = -1;
   *nreg = -1;
   *MaxL1Size = 64;
   *pre = 'd';
   *lang = 'C';
   *ku = 0;
   *ROUT = 0;
   for (i=1; i < nargs; i++)
   {
      if (*args[i] != '-') PrintUsage(args[0]);
      flag = args[i][1];
      if (flag == 'h') PrintUsage(args[0]);
/*
 *    All remaining flags take a value; refuse to read past the end of args
 */
      if (++i >= nargs) PrintUsage(args[0]);
      switch(flag)
      {
      case 'K':
         *ku = atoi(args[i]);
         break;
      case 'L':
         if ( (*args[i] == 'F') || (*args[i] == 'f') ) *lang = 'F';
         break;
      case 'm' :
         *MaxL1Size = atoi(args[i]);
         break;
      case 'r' :
         *nreg = atoi(args[i]);
         break;
      case 'f' :
         *FRC = atoi(args[i]);
         break;
      case 'l' :
         *LAT = atoi(args[i]);
         break;
      case 'p' :
         *pre = *args[i];
         break;
      case 'R':
         *ROUT = atoi(args[i]);
         break;
      default:   /* unknown flag: report usage rather than treating it as -R */
         PrintUsage(args[0]);
      }
   }
}

/*
 * Build the list of candidate blocking factors for precision prec and write
 * it to file NBnam (first line is the count, then one NB per line).
 *
 * The L1 cache size (in KB) is read from res/L1CacheSize; if that file does
 * not exist, "make RunL1 MaxL1=<MaxL1Size>" is run to create it.  Candidates
 * are multiples of the cache line length (in elements) that are between 16
 * and 64 and whose square does not exceed the number of elements in L1.
 * Exits the program on any failure.
 */
void findNBs(char prec, char *NBnam, int MaxL1Size)
{
   FILE *L1f, *NBf;
   char ln[80];
   int i, L1Size, tmp, tsize, tL1Size, CL, nNB;
   int NB[100];

   fprintf(stderr, "NB setting not supplied; calculating:\n");
   L1f = fopen("res/L1CacheSize", "r");
   if (!L1f)
   {
      sprintf(ln, "make RunL1 MaxL1=%d\n",MaxL1Size);
      if (system(ln) != 0)
      {
         remove("res/L1CacheSize");
         fprintf(stderr, "Error in command: %s", ln);
         exit(-1);
      }
      L1f = fopen("res/L1CacheSize", "r");
      assert(L1f != NULL);
   }
   if (fscanf(L1f, "%d", &L1Size) != 1)
   {
      fclose(L1f);
      fprintf(stderr, "Error: cannot read L1 size from res/L1CacheSize\n");
      exit(-1);
   }
   fclose(L1f);
   fprintf(stderr, "\n      Read in L1 Cache size as = %dKB.\n",L1Size);

   switch (prec)
   {
      case 's':
         tsize = sizeof(float);
         break;
      case 'd':
         tsize = sizeof(double);
         break;
      case 'q':
         tsize = sizeof(long double);
         break;
      case 'c':
         tsize = sizeof(float);
         break;
      case 'z':
         tsize = sizeof(double);
         break;
      default:  /* tsize would otherwise be used uninitialized */
         fprintf(stderr, "Error: unknown precision '%c'\n", prec);
         exit(-1);
   }

   tL1Size = L1Size * (1024 / tsize);
/*
 * Step by the cache line length in elements; never let the step be zero,
 * or the search loop below would not terminate
 */
   CL = ATL_Cachelen / tsize;
   if (!CL) CL = 1;
   tmp = CL;
   nNB = 0;
   fprintf(stderr, "tmp=%d, tL1size=%d\n",tmp, tL1Size);
   while (tmp*tmp <= tL1Size)
   {
      if (tmp >= 16)        /* no block sizes smaller than 16 */
         NB[nNB++] = tmp;
      if (tmp >= 64) break;  /* no block sizes bigger than 64 */
      tmp += CL;
   }
   if (!nNB)  /* this should never happen */
   {
      nNB = 3;
      NB[0] = 8;
      NB[1] = 4;
      NB[2] = 16;
   }
   else  /* put second biggest blocking factor first in list */
   {
      if (nNB > 2)
      {
         tmp = NB[nNB-2];
         NB[nNB-2] = NB[0];
         NB[0] = tmp;
      }
   }

   NBf = fopen(NBnam, "w");
   if (!NBf) fprintf(stderr, "ERROR: can't open file=%s\n", NBnam);
   assert(NBf != NULL);
   fprintf(NBf, "%d\n", nNB);
   for (i=0; i != nNB; i++) fprintf(NBf, "%d\n", NB[i]);
   fclose(NBf);
}

/*
 * Combine n timings into one representative value.
 *
 * n         : number of entries in mflop; the averaging logic assumes n == 3
 * tolerance : two results a >= b are considered to agree when
 *             tolerance*b >= a
 * mflop     : IN/OUT timings; sorted in place, largest first
 *
 * RETURNS: average of the results that agree with each other, or -1.0 if
 *          even the two smallest results disagree (caller should rerun).
 */
double GetAvg(int n, double tolerance, double *mflop)
{
   int i, j;
   double t0, tavg;
/*
 * Sort results, largest first
 */
   for (i=0; i != n; i++)
   {
      for (j=i+1; j < n; j++)
      {
         if (mflop[i] < mflop[j])
         {
            t0 = mflop[i];
            mflop[i] = mflop[j];
            mflop[j] = t0;
         }
      }
   }

/*
 * Throw out result if it is outside tolerance; rerun if two mflop not within
 * tolerance;  this code assumes n == 3
 */
   if (tolerance*mflop[1] < mflop[0])  /* too big a range in results */
   {
      if (tolerance*mflop[2] < mflop[1]) return(-1.0);
/*
 *    Largest result is the outlier: average the remaining two
 */
      tavg = (mflop[1] + mflop[2]) / 2.0;
   }
   else if (tolerance*mflop[2] < mflop[0]) tavg = (mflop[0] + mflop[1]) / 2.0;
   else tavg = (mflop[0] + mflop[1] + mflop[2]) / 3.0;

   return(tavg);
}

/*
 * Obtain the MFLOP rate for one square (NB x NB x NB) kernel case.
 *
 * pre    : precision character
 * MULADD : muladd setting passed to the make command
 * NB     : blocking factor, used for M, N, K, lda and ldb
 * mu, nu : M and N unrolling
 * ku     : K unrolling; -1 or anything larger than NB means NB
 * lat    : latency factor
 *
 * Results are cached in a file under res/ whose name encodes the case.  If
 * the file is missing, "make mmcase ..." is run to produce it.  NTIM timings
 * are read and combined with GetAvg().  If the timings vary more than
 * TOLERANCE allows, the result file is removed and the program exits.
 *
 * RETURNS: combined MFLOP rate for the case.
 */
double mms_case(char pre, int MULADD, int NB, int mu, int nu, int ku, int lat)
{
   char fnam[128], ln[256];
   int i;
   double mflop[NTIM], t0;
   FILE *fp;

   if (ku > NB) ku = NB;
   else if (ku == -1) ku = NB;
   sprintf(fnam, "res/%c%smm%c%c%d_%dx%dx%d_%dx%dx%d_%dx%dx%d%s%s_%dx%d_%d",
           pre, "JIK", 'T', 'N', NB, NB, NB, NB, NB, NB, 0, mu, nu, ku, 
           "_a1", "_b1", MULADD, lat, 1);
   fp = fopen(fnam, "r");
   if (fp == NULL)
   {
      if (pre == 'c' || pre == 'z')
         sprintf(ln,
" make mmcase pre=%c loopO=%s ta=%c tb=%c mb=%d nb=%d kb=%d lda=%d ldb=%d ldc=%d mu=%d nu=%d ku=%d alpha=%d beta=%d muladd=%d lat=%d csA=1 csB=1 csC=2 cleanup=%d\n",
                   pre, "JIK", 'T', 'N', NB, NB, NB, NB, NB, 0, mu, nu, ku,
                   1, 1, MULADD, lat, 1);
      else sprintf(ln,
" make mmcase pre=%c loopO=%s ta=%c tb=%c mb=%d nb=%d kb=%d lda=%d ldb=%d ldc=%d mu=%d nu=%d ku=%d alpha=%d beta=%d muladd=%d lat=%d cleanup=%d\n",
                   pre, "JIK", 'T', 'N', NB, NB, NB, NB, NB, 0, mu, nu, ku,
                   1, 1, MULADD, lat, 1);
      fprintf(stderr, "%s:\n",ln);
      if (system(ln) != 0)
      {
         fprintf(stderr, "Error in command: %s", ln);
         sprintf(ln, "rm -f %s\n", fnam);
         system(ln);
         exit(-1);
      }
/*
 *    Keep the fopen outside of assert so it still happens under NDEBUG
 */
      fp = fopen(fnam, "r");
      if (!fp) fprintf(stderr, "ERROR: can't find file=%s\n", fnam);
      assert(fp != NULL);
   }
   for (i=0; i != NTIM; i++)
   {
/*
 *    fscanf returns EOF (nonzero) on a short file, so compare against 1
 */
      if (fscanf(fp, "%lf", &mflop[i]) != 1)
      {
         fprintf(stderr, "ERROR: can't read timing %d from file=%s\n",
                 i, fnam);
         fclose(fp);
         exit(-1);
      }
   }
   fclose(fp);

   t0 = GetAvg(NTIM, TOLERANCE, mflop);
   if (t0 == -1.0)
   {
      fprintf(stderr, "NB=%d, MU=%d, NU=%d, KU=%d: rerun with higher reps; variation exceeds tolerence\n", NB, mu, nu, ku);
/*
 *    fnam already starts with res/, so do not prepend it again
 */
      sprintf(ln, "rm -f %s\n", fnam);
      system(ln);
      exit(-1);
   }
   fprintf(stdout, 
      "\npre=%c, muladd=%d, lat=%d, nb=%d, mu=%d, nu=%d, ku=%d, mflop=%.2f\n",
           pre, MULADD, lat, NB, mu, nu, ku, t0);
   return(t0);
}

/*
 * Obtain the MFLOP rate for a fully specified kernel case.
 *
 * nam            : result file to use, or NULL to derive a name under res/
 *                  from the case parameters; when given it is also passed to
 *                  make as casnam=<nam>
 * pre            : precision character
 * loopO, ta, tb  : loop order string and transpose characters
 * M, N, K        : problem dimensions passed to make
 * mb, nb, kb     : blocking settings passed to make
 * lda, ldb, ldc  : leading dimension settings passed to make
 * mu, nu, ku     : unrollings; ku of -1 or larger than K means K
 * muladd, lat    : multiply/add setting and latency factor
 * beta           : 1, -1, 0 select _b1/_bn1/_b0 in the derived name, anything
 *                  else selects _bX
 * csA, csB, csC  : passed to make for complex precisions only
 * FFetch, ifetch, nfetch : fetch settings; if ifetch or nfetch is -1 they
 *                  become mu*nu and 1
 *
 * If the result file is missing, "make mmcase ..." is run to create it.
 * NTIM timings are read and combined with GetAvg(); if they vary more than
 * TOLERANCE allows, the result file is removed and the program exits.
 *
 * RETURNS: combined MFLOP rate for the case.
 */
double mmcase(char *nam, char pre, char *loopO, char ta, char tb, 
              int M, int N, int K, int mb, int nb, int kb, 
              int lda, int ldb, int ldc, int mu, int nu, int ku, 
              int muladd, int lat, int beta, int csA, int csB, int csC, 
              int FFetch, int ifetch, int nfetch)
{
   char fnam[128], ln[512], bnam[16], casnam[128];
   int i, N0;
   double mflop[NTIM], t0;
   FILE *fp;

   if (ifetch == -1 || nfetch == -1) { ifetch = mu*nu; nfetch = 1; }
   if (beta == 1) sprintf(bnam, "_b1");
   else if (beta == -1) sprintf(bnam, "_bn1");
   else if (beta == 0) sprintf(bnam, "_b0");
   else sprintf(bnam, "_bX");
   N0 = Mmax(M,N);
   if (N0 < K) N0 = K;
   if (ku > K) ku = K;
   else if (ku == -1) ku = K;
   if (nam)
   {
/*
 *    "casnam=" plus nam plus the terminator must fit in casnam, which also
 *    guarantees that nam fits in fnam
 */
      assert(strlen(nam) + sizeof("casnam=") <= sizeof(casnam));
      strcpy(fnam, nam);
      sprintf(casnam, "casnam=%s", nam);
   }
   else
   {
      sprintf(fnam, "res/%c%smm%c%c%d_%dx%dx%d_%dx%dx%d_%dx%dx%d%s%s_%dx%d_%d",
              pre, loopO, ta, tb, N0, mb, nb, kb, lda, ldb, ldc, mu, nu, ku, 
              "_a1", bnam, muladd, lat, 1);
      casnam[0] = '\0';
   }
   fp = fopen(fnam, "r");
   if (fp == NULL)
   {
      if (pre == 'c' || pre == 'z')
         sprintf(ln,
" make mmcase pre=%c loopO=%s ta=%c tb=%c M=%d N=%d K=%d mb=%d nb=%d kb=%d lda=%d ldb=%d ldc=%d mu=%d nu=%d ku=%d alpha=%d beta=%d muladd=%d lat=%d cleanup=%d csA=%d csB=%d csC=%d ff=%d if=%d nf=%d %s\n",
                 pre, loopO, ta, tb, M, N, K, mb, nb, kb, lda, ldb, ldc, 
                 mu, nu, ku, 1, beta, muladd, lat, 1, csA, csB, csC, FFetch,
                 ifetch, nfetch, casnam);
      else sprintf(ln,
" make mmcase pre=%c loopO=%s ta=%c tb=%c M=%d N=%d K=%d mb=%d nb=%d kb=%d lda=%d ldb=%d ldc=%d mu=%d nu=%d ku=%d alpha=%d beta=%d muladd=%d lat=%d cleanup=%d ff=%d if=%d nf=%d %s\n",
                   pre, loopO, ta, tb, M, N, K, mb, nb, kb, lda, ldb, ldc, 
                   mu, nu, ku, 1, beta, muladd, lat, 1, FFetch, ifetch, nfetch,
                   casnam);
      fprintf(stderr, "%s:\n",ln);
      if (system(ln) != 0)
      {
         fprintf(stderr, "Error in command: %s", ln);
         sprintf(ln, "rm -f %s\n", fnam);
         system(ln);
         exit(-1);
      }
      fp = fopen(fnam, "r");
      if (!fp) fprintf(stderr, "ERROR: can't find file=%s\n", fnam);
      assert(fp);
   }
   for (i=0; i != NTIM; i++)
   {
/*
 *    fscanf returns EOF (nonzero) on a short file, so compare against 1,
 *    and keep the read outside of assert so it still happens under NDEBUG
 */
      if (fscanf(fp, "%lf", &mflop[i]) != 1)
      {
         fprintf(stderr, "ERROR: can't read timing %d from file=%s\n",
                 i, fnam);
         fclose(fp);
         exit(-1);
      }
   }
   fclose(fp);

   t0 = GetAvg(NTIM, TOLERANCE, mflop);
   if (t0 == -1.0)
   {
      fprintf(stderr, 
      "case=%s: rerun with higher reps; variation exceeds tolerence\n", fnam);
      sprintf(ln, "rm -f %s\n", fnam);
      system(ln);
      exit(-1);
   }
   fprintf(stdout, 
"\n   pre=%c, loopO=%s, ta=%c tb=%c, mb=%d, nb=%d, kb=%d, lda=%d, ldb=%d, ldc=%d\n",
           pre, loopO, ta, tb, mb, nb, kb, lda, ldb, ldc);
  fprintf(stdout, "   mu=%d, nu=%d, ku=%d, muladd=%d, lat=%d ====> mflop=%f\n",
          mu, nu, ku, muladd, lat, t0);
   return(t0);
}

/*
 * Pick a latency factor that evenly divides mu*nu*ku.
 *
 * With MULADD set, or when lat <= 1, kb <= ku, or lat already divides
 * mu*nu*ku, lat is returned unchanged.  Otherwise the nearest divisor at or
 * below lat and the nearest divisor at or above lat (searching no further
 * than MAXLAT) are found, and the closer of the two is returned; a lower
 * divisor smaller than 2 is never preferred over the upper one.
 */
int GetGoodLat(int MULADD, int kb, int mu, int nu, int ku, int lat)
{
   const int nops = mu*nu*ku;
   int below, above;

   if (MULADD) return(lat);
   if ( (lat <= 1) || (kb <= ku) || (nops % lat == 0) ) return(lat);

/*
 * Nearest divisor not exceeding lat
 */
   below = lat;
   while (below && (nops % below)) below--;
/*
 * Nearest divisor not smaller than lat; fall back to the lower one if the
 * upward search ran out at MAXLAT without finding a divisor
 */
   above = lat;
   while ( (above < MAXLAT) && (nops % above) ) above++;
   if (nops % above) above = below;

   if (below < 2) return(above);
   return( (lat-below < above-lat) ? below : above );
}

void FindMUNU(int muladd, int lat, int nr, int *MU, int *NU)
/*
 * Choose a near-square mu x nu register block for nr registers.
 * With nr < 1 the answer is lat x 1.  Otherwise lat registers are set aside
 * when muladd is zero, and fewer than 3 remaining registers gives 1 x 1.
 * On return *MU >= *NU.
 */
{
   int avail, side, target;
   int rows = 1, cols = 1;

   if (nr < 1)
   {
      *MU = lat;
      *NU = 1;
      return;
   }
   avail = muladd ? nr : nr - lat;
   if (avail >= 3)
   {
/*
 *    Smallest side whose square reaches avail+1, then back off by one if
 *    the square is exact and by two otherwise
 */
      target = avail + 1;
      side = 1;
      while (side*side < target) side++;
      side -= (side*side > target) ? 2 : 1;
      if (side >= 1)
      {
         cols = side;
         rows = (nr - side) / (side + 1);
         if (rows < 1) rows = 1;
      }
/*
 *    Keep the larger dimension in MU
 */
      if (rows < cols)
      {
         side = rows;
         rows = cols;
         cols = side;
      }
   }
   *MU = rows;
   *NU = cols;
}

/*
 * Write a column-header line followed by one line holding the given kernel
 * parameters and MFLOP rate to the already opened stream fp.
 */
void PutInstLogFile(FILE *fp, int muladd, int lat, int nb, 
                    int mu, int nu, int ku, int ForceFetch, 
                    int ifetch, int nfetch, double mflop)
{
   fprintf(fp,
           "MULADD  LAT  NB  MU  NU  KU  FFTCH  IFTCH  NFTCH    MFLOP\n"
           "%6d  %3d %3d %3d %3d %3d  %5d  %5d  %5d  %7.2lf\n",
           muladd, lat, nb, mu, nu, ku, ForceFetch, ifetch, nfetch, mflop);
}
/*
 * Create (or truncate) file fnam and store the given kernel parameters in it
 * using PutInstLogFile().  Asserts that the file can be opened.  The pre
 * argument is accepted but not used.
 */
void PutInstLogFile1(char *fnam, char pre, int muladd, int lat, int nb, 
                    int mu, int nu, int ku, int ForceFetch, 
                    int ifetch, int nfetch, double mflop)
{
   FILE *logf = fopen(fnam, "w");

   assert(logf);
   PutInstLogFile(logf, muladd, lat, nb, mu, nu, ku, ForceFetch, ifetch,
                  nfetch, mflop);
   fclose(logf);
}
/*
 * Read back kernel parameters from file nam, which is expected to hold one
 * header line followed by one line of values in the order
 * muladd lat nb mu nu ku ForceFetch ifetch nfetch mflop.
 *
 * All pointer arguments are outputs.  The pre argument is accepted but not
 * used.  Asserts that the file can be opened, and exits the program if the
 * header or any of the ten values cannot be read, so the outputs are never
 * left unset on return.
 */
void GetInstLogFile(char *nam, char pre, int *muladd, int *lat, int *nb, 
                    int *mu, int *nu, int *ku, int *ForceFetch,
                    int *ifetch, int *nfetch, double *mflop)
{
   char ln[128];
   FILE *fp;
   int nread;

   fp = fopen(nam, "r");
   if (fp == NULL) fprintf(stderr, "file %s not found!!\n\n", nam);
   assert(fp);
   if (fgets(ln, 128, fp) == NULL)   /* skip the column-header line */
   {
      fprintf(stderr, "file %s is empty!!\n\n", nam);
      fclose(fp);
      exit(-1);
   }
   nread = fscanf(fp, " %d  %d %d %d %d %d %d %d %d %lf\n",
                  muladd, lat, nb, mu, nu, ku, ForceFetch, ifetch, nfetch,
                  mflop);
   fclose(fp);
   if (nread != 10)
   {
      fprintf(stderr, "file %s is malformed!!\n\n", nam);
      exit(-1);
   }
}

/*
 * Write the summary file res/<pre>MMRES: the PutInstLogFile() record for
 * the given kernel parameters followed by a line holding maxreg.
 * Asserts that the file can be opened.
 */
void CreateSummary(char pre, int maxreg, int muladd, int lat, int nb,
                   int mu, int nu, int ku, int FFetch, int ifetch, int nfetch,
                   double mflop)
{
   char path[256];
   FILE *sumf;

   sprintf(path, "res/%cMMRES", pre);
   sumf = fopen(path, "w");
   assert(sumf);
   PutInstLogFile(sumf, muladd, lat, nb, mu, nu, ku, FFetch, ifetch, nfetch,
                  mflop);
   fprintf(sumf, "%d\n", maxreg);
   fclose(sumf);
}

void FindFetch(char ta, char tb, char pre, int mb, int nb, int kb, 
               int mu, int nu, int ku, int muladd, int lat,
               int *FFetch0, int *ifetch0, int *nfetch0)
/*
 * See what fetch patterns are appropriate.
 * Starting from the default pattern (ifetch = mu*nu, nfetch = 1), every
 * ifetch in [2, mu+nu) combined with every nfetch in [1, mu+nu-ifetch] is
 * timed with mmcase() and the fastest pattern is kept.  Then, for that
 * pattern, the case is timed with and without forced fetch.
 * OUTPUT: *FFetch0 is 1 if forced fetch was faster, else 0;
 *         *ifetch0 and *nfetch0 are the selected fetch pattern.
 */
{
   char fnam[128];
   const int nelts = mu+nu;
   int csA=1, csB=1, csC=1, nleft, i, j;
   int ifetch = mu*nu, nfetch = 1;
   double mf, mf0;

   if (pre == 'c' || pre == 'z') csC = 2;

   mf0 = mmcase(NULL, pre, "JIK", ta,  tb, mb, nb, kb, mb, nb, kb, 
                kb, kb, 0, mu, nu, ku, muladd, lat, 0, csA, csB, csC, 
                0, ifetch, nfetch);

   for (i=2; i < nelts; i++)
   {
      nleft = nelts - i;
      for (j=1; j <= nleft; j++)
      {
         sprintf(fnam, "res/%cMMfetch%d_%d", pre, i, j);
         mf = mmcase(fnam, pre, "JIK", ta,  tb, mb, nb, kb, mb, nb, kb, 
                     kb, kb, 0, mu, nu, ku, muladd, lat, 0, csA, csB, csC, 
                     0, i, j);
         if (mf > mf0)
         {
/*
 *          Raise the bar so that only a still faster pattern replaces this
 */
            mf0 = mf;
            ifetch = i;
            nfetch = j;
         }
      }
   }
/*
 * See if prefetching good idea for beta=0 case
 */
   sprintf(fnam, "res/%cMM_b0", pre);
   mf0 = mmcase(fnam, pre, "JIK", ta,  tb, mb, nb, kb, mb, nb, kb, 
                kb, kb, 0, mu, nu, ku, muladd, lat, 0, csA, csB, csC, 
                0, ifetch, nfetch);

   sprintf(fnam, "res/%cMM_b0_pref", pre);
   mf = mmcase(fnam, pre, "JIK", ta,  tb, mb, nb, kb, mb, nb, kb, 
               kb, kb, 0, mu, nu, ku, muladd, lat, 0, csA, csB, csC, 
               1, ifetch, nfetch);

   *FFetch0 = (mf > mf0);
   *ifetch0 = ifetch;
   *nfetch0 = nfetch;
   fprintf(stdout, "\n\nFORCEFETCH=%d, IFETCH = %d, NFETCH = %d\n\n",
           *FFetch0, *ifetch0, *nfetch0);
}

/*
 * Search for the best kernel parameters for precision pre.
 *
 * pre    : precision character
 * MULADD : multiply/add setting to search with
 * Fku    : forced K unrolling: 0 means try both full and no unrolling,
 *          -1 means always unroll fully, anything else is used as given
 * nNBs   : number of entries in NBs
 * NBs    : IN/OUT candidate blocking factors; on return the blocking factor
 *          that won the initial search is in NBs[0]
 * nreg   : number of registers available
 * LAT    : latency factor to start from
 * Fnb    : if nonzero, run the unrolling search for every entry of NBs;
 *          otherwise only for NBs[0], with the other blocking factors tried
 *          afterwards using the best unrolling
 *
 * The steps are: initial NB search, mu/nu unrolling search, optional retry
 * of other blocking factors, K unrolling confirmation, latency confirmation,
 * a check of the opposite MULADD setting, and a fetch-pattern search.
 * The chosen NB is written to res/<pre>NB.  The result is stored in the Z*
 * globals when ComplexFind is set (and nothing else is written); otherwise
 * it is stored in the R* globals and written to res/<pre>MMRES and
 * res/<pre>BEST.
 */
void mmsearch(char pre, int MULADD, int Fku, int nNBs, int *NBs, int nreg,
              int LAT, int Fnb)
{
   int latB, muB, nuB, kuB, nbB;
   int nb, ku, lat, ifetch, nfetch, FFetch;
   int TEST_MU, TEST_NU;
   int i, j, k, NB, nNB, maxreg;
   int NO1D=0;
   double mfB, mf, mf0, mf1;
   char ln[80];
   FILE *fp;

   lat = LAT;
/*
 * Try not to tempt fate by using all registers
 */
   if (nreg > 16) i = nreg-2;
   else i = nreg;
   FindMUNU(MULADD, lat, i, &TEST_MU, &TEST_NU);
/*
 * Start the best case at the first candidate so that it is always defined
 */
   nbB = kuB = (nNBs > 0) ? NBs[0] : 0;
   muB = TEST_MU;
   nuB = TEST_NU;
   latB = lat;
/*
 * First, find a good NB
 */
   mfB = 0.0;
   fprintf(stderr, "Doing initial NB search:\n");
   i = 0;
   for (k=0; k != nNBs; k++)
   {
      NB = NBs[k];
      ku = NB;
      mf = mms_case(pre, MULADD, NB, TEST_MU, TEST_NU, ku, lat);
      if (mf > mfB)
      {
         mfB = mf;
         nbB = NB;
         muB = TEST_MU;
         nuB = TEST_NU;
         kuB = ku;
         latB = lat;
         i = k;
      }
      if (Fku == 0)  /* try no K-loop unrolling */
      {
         lat = GetGoodLat(MULADD, NB, TEST_MU, TEST_NU, 1, LAT);
         mf = mms_case(pre, MULADD, NB, TEST_MU, TEST_NU, 1, lat);
         if (mf > mfB)
         {
            mfB = mf;
            nbB = NB;
            muB = TEST_MU;
            nuB = TEST_NU;
            kuB = 1;
            latB = lat;
            i = k;
         }
      }
   }
/*
 * Move the winner to the front once the whole list has been timed; swapping
 * inside the loop would swap the same pair again on later iterations
 */
   if (i)
   {
      j = NBs[i];
      NBs[i] = NBs[0];
      NBs[0] = j;
   }
   fprintf(stderr, "NB=%d selected:\n", NBs[0]);

/*
 * Unless a full search over NB is forced, only search with the best NB
 */
   if (Fnb) nNB = nNBs;
   else nNB = 1;
   if (MULADD)
      fprintf(stderr, "\nCombined multiply add, latency factor=%d, NB=%d ku=%d, chosen; initial MFLOP=%f.  Beginning unroll search:\n", latB, NBs[0], kuB, mfB);
   else
      fprintf(stderr, "\nSeparate multiply and add, latency factor=%d, NB=%d ku=%d, chosen; initial MFLOP=%f.  Beginning unroll search:\n", latB, NBs[0], kuB, mfB);

/*
 * See if we can skip 1D cases
 */
   lat = GetGoodLat(MULADD, NBs[0], 3, 3, 1, LAT);
   if (nreg >= 15+(!MULADD)*Mmax(LAT,lat))
   {
      mf0 = mms_case(pre, MULADD, NBs[0], 3, 3, 1, lat);
      mf1 = mms_case(pre, MULADD, NBs[0], 3, 3, NBs[0], LAT);
      mf = Mmax(mf1, mf0);
      mf0 = mms_case(pre, MULADD, NBs[0], 9, 1, 1, lat);
      if (mf0 > mf) NO1D = 0;
      else if (mms_case(pre, MULADD, NBs[0], 9, 1, NBs[0], LAT) > mf) NO1D = 0;
      else if (mms_case(pre, MULADD, NBs[0], 1, 9, NBs[0], LAT) > mf) NO1D = 0;
      else if (mms_case(pre, MULADD, NBs[0], 1, 9, 1, lat) > mf) NO1D = 0;
      else NO1D = 1;
   }
   else NO1D = 0;
   if (NO1D) fprintf(stderr, "\n\nSkipping most 1D cases\n\n");
   else fprintf(stderr, "\n\nTiming 1D cases\n\n");
   for (k=0; k != nNB; k++)
   {
      NB = NBs[k];
      if (NB >= nreg) maxreg = nreg;
      else maxreg = NB;
      for (i=1; i <= maxreg; i++)
      {
         for (j=1; j <= maxreg; j++)
         {
            if ( (((i==1) && (j > 4)) || ((j==1) && (i > 4))) && NO1D) continue;
            if (Fku == -1 || (!Fku) ) ku = NB;
            else if (Fku) ku = Fku;
            if (ku != NB) lat = GetGoodLat(MULADD, NB, i, j, ku, LAT);
            else lat = LAT;
            if (j*i+j+i+(!MULADD)*lat > nreg) continue;  /* not enough regs */
            mf = mms_case(pre, MULADD, NB, i, j, ku, lat);
            if (mf > mfB)
            {
               mfB = mf;
               nbB = NB;
               muB = i;
               nuB = j;
               kuB = ku;
               latB = lat;  /* latency actually timed, may differ from LAT */
            }
            if (!Fku)
            {
               lat = GetGoodLat(MULADD, NB, i, j, 1, LAT);
               mf = mms_case(pre, MULADD, NB, i, j, 1, lat);
               if (mf > mfB)
               {
                  mfB = mf;
                  nbB = NB;
                  muB = i;
                  nuB = j;
                  kuB = 1;
                  latB = lat;
               }
            }
         }
      }
   }
   fprintf(stderr, "\n\nBest case so far: nb=%d, mu=%d, nu=%d, ku=%d, lat=%d; MFLOPS=%f.\n",
           nbB, muB, nuB, kuB, latB, mfB);
   fprintf(stderr, "Trying various other NB and KU settings:\n\n");
/*
 * If we haven't checked all permutations, try other blocking factors
 */
   nb = nbB;
   if (!Fnb)
   {
      if (nNBs > 1) fprintf(stderr, "Trying various blocking factors:\n");
      mf = mms_case(pre, MULADD, NBs[0], muB, nuB, kuB, latB);
      for (k=0; k < nNBs; k++)
      {
         NB = NBs[k];
         if (Fku == -1) ku = NB;
         else if (Fku) ku = Fku;
         else if (kuB == nbB) ku = NB;
         else ku = kuB;
         if (ku != NB) lat = GetGoodLat(MULADD, NB, muB, nuB, ku, latB);
         else lat = latB;
         mf = mms_case(pre, MULADD, NB, muB, nuB, ku, lat);
         if (mf > mfB)
         {
            kuB = ku;
            mfB = mf;
            nbB = NB;
            latB = lat;
         }
      }
   }
   if (nb != nbB) fprintf(stderr, "\nNew block factor of %d chosen!!\n\n", nbB);
   NB = nbB;
/*
 * Save NB we've found 
 */
   sprintf(ln, "res/%cNB", pre);
   fp = fopen(ln, "w");
   assert(fp);
   fprintf(fp, "%d\n%d\n", 1, nbB);
   fclose(fp);

/*
 * For best case, try various ku's
 */
   fprintf(stderr, "Confirming K-loop unrollings for chosen NB:\n");
   mf = mms_case(pre, MULADD, nbB, muB, nuB, nbB, LAT);
   if (mf > mfB)
   {
      kuB = nbB;
      mfB = mf;
      latB = LAT;
   }
   for (k=1; k < nbB; k += 4)
   {
      if (k == 5) k = 4;
      if (k > nbB/2) k = nbB;
      lat = GetGoodLat(MULADD, nbB, muB, nuB, k, latB);
      mf = mms_case(pre, MULADD, nbB, muB, nuB, k, lat);
      if (mf > mfB)
      {
         latB = lat;
         kuB = k;
         mfB = mf;
      }
   }

   fprintf(stderr, "\nConfirming latency factors for chosen parameters:\n");
   for (i=1; i <= MAXLAT; i++)
   {
      lat = GetGoodLat(MULADD, nbB, muB, nuB, kuB, i);
      if (lat == i)
      {
         mf = mms_case(pre, MULADD, nbB, muB, nuB, kuB, lat);
         if (mf > mfB)
         {
            mfB = mf;
            latB = i;
         }
      }
   }
   fprintf(stderr, "\n\n   Best latency factor=%d\n\n", latB);

/*
 * Make sure MULADD is correct
 */
   lat = GetGoodLat(!MULADD, nbB, muB, nuB, kuB, latB);
   mf = mms_case(pre, !MULADD, nbB, muB, nuB, kuB, lat);
   if (mf > mfB)
   {
      fprintf(stderr, "\n\nMULLADD MAY BE WRONG!!, old=%f, new=%f\n", mfB, mf);
   }
/*
 * Try various fetch patterns
 */
   FindFetch('T', 'N', pre, nbB, nbB, nbB, muB, nuB, kuB, MULADD, latB,
             &FFetch, &ifetch, &nfetch);
/*
 * Save best case parameters we have found
 */
   if (ComplexFind)
   {
      Zmuladd=MULADD;  Znb=nbB; Zmu=muB; Znu=nuB; Zku=kuB; Zlat=latB;
      ZFFetch=FFetch; Zifetch=ifetch; Znfetch=nfetch;
      return;
   }
   RmuladdB=MULADD;  RnbB=nbB;  RmuB=muB;  RnuB=nuB;  RkuB=kuB;  RlatB=latB;
   RFFetch=FFetch; Rifetch=ifetch; Rnfetch=nfetch;
   RmfB = mfB;
/*
 * NOTE(review): the summary records the default fetch pattern rather than
 * the one FindFetch just selected -- confirm this is intended
 */
   CreateSummary(pre, nreg, MULADD, latB, nbB, muB, nuB, kuB, 0, muB*nuB, 1, 
                 mfB);
   sprintf(ln, "res/%cBEST", pre);
   fp = fopen(ln, "w");
   assert(fp);
   fprintf(fp,"muladd=%d beta=1 pre=%c ldc=%d nb=%d mu=%d nu=%d ku=%d lat=%d\n",
           MULADD, pre, 0, nbB, muB, nuB, kuB, latB);
   fclose(fp);

}

void FindNC_0(char ta, char tb, char pre, int N, int mb, int nb, int kb, 
              int mu, int nu, int ku, int muladd, int lat,
              int FFetch, int ifetch, int nfetch)
/*
 * For transpose settings ta/tb, time N x N x N cases through mmcase() with
 * lda=ldb=ldc=0 and beta=1, searching over K unrolling and latency factor
 * while mu, nu and the fetch settings stay fixed.  The best case is printed
 * to stdout and written, via PutInstLogFile(), to
 * res/<pre>best<ta><tb>_<mb>x<nb>x<kb>.
 * kb == 0 selects the wider K-unrolling search below; otherwise only
 * ku = 1, 4 and kb are tried in addition to the incoming ku.
 */
{
   int kuB=ku, latB=lat, lat0=lat, kb0=kb;
   int i, j, k, csA=1, csB=1, csC=1, kmax;
   double mf0, mf;
   char fnam[128];
   FILE *fp;

   if (pre == 'c' || pre == 'z') csA = csB = csC = 2;
   assert(N > 0);
   if (kb == 0)
   {
      /* stand-in K used only for the GetGoodLat() calls */
      kb0 = 100000;
      /*
       * NOTE(review): this guard compares (mb*nb)/lat with lat and divides
       * by lat without checking for zero; it does not look like a
       * divisibility test -- confirm the intent
       */
      if ((mb*nb)/lat != lat) lat0 = GetGoodLat(muladd, kb0, mu, nu, 1, lat);
   }
/*
 * Cap on the K unrolling: grows in steps of 4 until kmax*kmax reaches
 * 1024/(mu*nu), is doubled for real precisions, then limited by N
 */
   k = 1024 / (mu*nu);
   for (kmax=4; kmax*kmax < k; kmax += 4);
   if (pre == 'd' || pre == 's') kmax *= 2;
   if (kmax >= N) kmax = N;
   else if (kmax > N/2) kmax = N/2;
   if (kb == 0) kuB = k = Mmin(ku,kmax);
   else k = ku;
/*
 * Find best non-cleanup case
 */
   mf0 = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                mu, nu, k, muladd, lat0, 1, csA, csB, csC, 
                FFetch, ifetch, nfetch);
   latB = lat0;
/*
 * If kb is not known, try all available K unrollings; for large mu*nu*N
 * combinations, don't try maximal unrollings in order to avoid having
 * the compiler run out of space trying to optimize
 */
   if (kb == 0)
   {
      /* visits k = 1, 4, 8, 12, ... and jumps to kmax once k passes N/2 */
      for (k=1; k < kmax; k += 4)
      {
         if (k == 5) k = 4;
         if (k > N/2) k = kmax;
         j = k;
         /* always true inside this branch, so j is always 1 here */
         if (kb == 0) j = 1;
         i = GetGoodLat(muladd, kb0, mu, nu, j, lat);
         mf = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                     mu, nu, k, muladd, i, 1, csA, csB, csC, 
                     FFetch, ifetch, nfetch);
         if (mf > mf0) 
         {
            mf0 = mf;
            kuB = k;
            latB = i;
         }
      }
   }
/*
 * If K is known, try only the most common unrollings
 */
   else
   {
      /* ku = 1 */
      i = GetGoodLat(muladd, kb0, mu, nu, 1, lat);
      mf = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                  mu, nu, 1, muladd, i, 1, csA, csB, csC, 
                  FFetch, ifetch, nfetch);
      if (mf > mf0) 
      {
         mf0 = mf;
         kuB = 1;
         latB = i;
      }
      /* ku = 4 */
      i = GetGoodLat(muladd, kb0, mu, nu, 4, lat);
      mf = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                  mu, nu, 4, muladd, i, 1, csA, csB, csC,
                  FFetch, ifetch, nfetch);
      if (mf > mf0) 
      {
         mf0 = mf;
         kuB = 4;
         latB = i;
      }
      /* ku = kb, with the incoming latency factor unchanged */
      mf = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                  mu, nu, kb, muladd, lat, 1, csA, csB, csC,
                  FFetch, ifetch, nfetch);
      if (mf > mf0) 
      {
         mf0 = mf;
         kuB = kb;
         latB = lat;
      }
   }
/*
 * Try various latencies
 */
   /* only latencies 2..8 that evenly divide mu*nu*i are timed */
   if (kb) i = kuB;
   else i = 1;
   for (k=2; k < 9; k++)
   {
      if (((mu*nu*i)/k)*k == mu*nu*i)
      {
         mf = mmcase(NULL, pre, "JIK", ta, tb, N, N, N, mb, nb, kb, 0, 0, 0, 
                     mu, nu, kuB, muladd, k, 1, csA, csB, csC,
                     FFetch, ifetch, nfetch);
         if (mf > mf0)
         {
            mf0 = mf;
            latB = k;
         }
      }
   }
   fprintf(stdout, "BEST for %c%c_%dx%dx%d: mflop=%.2f\n", 
           ta, tb, mb, nb, kb, mf0);
   fprintf(stdout, 
           "pre=%c ta=%c tb=%c nb=%d mu=%d nu=%d ku=%d muladd=%d lat=%d\n",
           pre, ta, tb, nb, mu, nu, kuB, muladd, latB);
   sprintf(fnam, "res/%cbest%c%c_%dx%dx%d", pre, ta, tb, mb, nb, kb);
   fp = fopen(fnam, "w");
   assert(fp);
   /* the NB column of the record holds N, not the nb argument */
   PutInstLogFile(fp,muladd, latB, N, mu, nu, kuB, FFetch, ifetch, nfetch, mf0);
   fclose(fp);
}

/*
 * Run FindNC_0() for one transpose combination with N = nb in three
 * configurations, in this order:
 *   (mb,nb,kb) = (nb,nb,nb), then (0,0,nb), then (0,0,0).
 */
void FindNC0(char ta, char tb, char pre, int nb, int mu, int nu, int ku, 
             int muladd, int lat, int FFetch, int ifetch, int nfetch)
{
   int pass, mnb, kb;

   for (pass=0; pass < 3; pass++)
   {
      mnb = (pass == 0) ? nb : 0;   /* M and N blocking given on 1st pass only */
      kb  = (pass == 2) ? 0 : nb;   /* K blocking dropped on the last pass */
      FindNC_0(ta, tb, pre, nb, mnb, mnb, kb, mu, nu, ku, muladd, lat,
               FFetch, ifetch, nfetch);
   }
}

int FindNoCopyNB(char pre, int nb, int mu, int nu, int ku0, int muladd, int lat,
                 int FFetch, int ifetch, int nfetch)
/*
 * For complex, see if a smaller blocking factor is needed for no-copy.
 * Blocking factors nb, nb-4, ... down to 12 are timed with mmcase() and the
 * fastest is chosen.  For real precisions nothing is timed, nb is kept, and
 * the rates in the report are printed as 0.00.
 * The chosen value is written to res/<pre>NCNB and reported on stdout.
 * RETURNS: chosen no-copy blocking factor
 */
{
   char fnam[128];
   int i, ku, nbB=nb, csA=2, csB=2, csC=2;
   double mf, mfB=0.0, mf0=0.0;  /* defined even when nothing is timed */
   FILE *fp;

   if (pre == 'z' || pre == 'c')
   {
      mfB = mmcase(NULL, pre, "JIK", 'T', 'N', nb, nb, nb, nb, nb, nb, 0, 0, 0, 
                   mu, nu, ku0, muladd, lat, 1, csA, csB, csC, 
                   FFetch, ifetch, nfetch);
      mf0 = mfB;
      for (i=nb-4; i >= 12; i -= 4)
      {
         ku = Mmin(i,ku0);
         mf = mmcase(NULL, pre, "JIK", 'T', 'N', i, i, i, i, i, i, 0, 0, 0, 
                     mu, nu, ku, muladd, lat, 1, csA, csB, csC,
                     FFetch, ifetch, nfetch);
         if (mf > mfB)
         {
            mfB = mf;
            nbB = i;
         }
      }
   }
   sprintf(fnam, "res/%cNCNB", pre);
   fp = fopen(fnam, "w");
   assert(fp);
   fprintf(fp, "%d\n", nbB);
   fclose(fp);
   fprintf(stdout, "\n%cNB = %d (%.2f), No copy %cNB = %d (%.2f)\n\n", 
           pre, nb, mf0, pre, nbB, mfB);
   return(nbB);
}

/*
 * Pick the no-copy blocking factor with FindNoCopyNB(), limit ku to it, and
 * then run FindNC0() for the transpose combinations NN, NT, TN and TT, in
 * that order.
 */
void FindNoCopy(char pre, int nb, int mu, int nu, int ku, int muladd, int lat,
                int FFetch, int ifetch, int nfetch)
{
   static const char trans[2] = {'N', 'T'};
   int ia, ib;

   nb = FindNoCopyNB(pre, nb, mu, nu, ku, muladd, lat, FFetch, ifetch, nfetch);
   if (ku > nb) ku = nb;
   for (ia=0; ia < 2; ia++)
   {
      for (ib=0; ib < 2; ib++)
      {
         FindNC0(trans[ia], trans[ib], pre, nb, mu, nu, ku, muladd, lat,
                 FFetch, ifetch, nfetch);
      }
   }
}

/*
 * Decide for which K values a kernel specialized for that K is worth having.
 *
 * If res/<pre>CleanK already exists nothing is done.  Otherwise, for
 * kb = nb, nb-1, ..., a kernel with K fixed to kb (result file
 * res/<pre>KB_<kb>) is timed against general-K kernels; one line is written
 * to res/<pre>CleanK for each kb where the specialized kernel is more than
 * 1% faster, and the search stops at the first kb where it is not.  A final
 * line with KB=0 records the general-K kernel.
 * ifetch or nfetch of -1 select the default pattern (mu*nu, 1).
 */
void FindCleanupK(char pre, int nb, int mu, int nu, int ku0, int muladd, 
                  int lat0, int FFetch, int ifetch, int nfetch)
{
   char fnam[256];
   int genlat=0, genku=0, speclat, ku, kumax;
   int kb, beta, csC;
   double mf, genmf=0.0, specmf=0.0;  /* defined even if the loop never runs */
   int i;
   FILE *fp;

/*
 * Cap on the K unrolling used for the general-K kernel
 */
   i = 1024 / (mu*nu);
   for (kumax=4; kumax*kumax < i; kumax += 4);
   if (pre == 'd' || pre == 's') kumax *= 2;
   if (kumax >= nb) kumax = nb;
   else if (kumax > nb/2) kumax = nb/2;
   if (ifetch == -1 || nfetch == -1) { ifetch = mu*nu; nfetch = 1; }
   if (pre == 's' || pre == 'd')
   {
      csC = 1;
      beta = 1;
   }
   else
   {
      csC = 2;
      beta = 8;
   }
   sprintf(fnam, "res/%cCleanK", pre);
   fp = fopen(fnam, "r");
   if (fp == NULL)
   {
      fp = fopen(fnam, "w");
      assert(fp);
      fprintf(fp, " KB  MULADD  LAT  NB  MU  NU  KU  FFTCH  IFTCH  NFTCH  GEN-MFLOP  SPC-MFLOP\n");

      for (kb = nb; kb; kb--)
      {
         ku = Mmin(ku0, kb);
         sprintf(fnam, "res/%cKB_%d", pre, kb);
         speclat = GetGoodLat(muladd, kb, mu, nu, ku, lat0);
         specmf = mmcase(fnam, pre, "JIK", 'T', 'N', nb, nb, kb, 0, 0, 
                         kb, kb, kb, 0, mu, nu, ku, muladd, speclat, beta, 
                         1, 1, csC, FFetch, ifetch, nfetch);

         sprintf(fnam, "res/%cKB_0_%d", pre, ku);
         genlat = GetGoodLat(muladd, 8000, mu, nu, 1, lat0);
         genku = Mmin(kumax, ku);
         genmf = mmcase(fnam,pre, "JIK", 'T', 'N', nb, nb, kb, 0, 0, 0, 0, 0, 0,
                        mu, nu, genku, muladd, genlat, beta, 1, 1, csC,
                        FFetch, ifetch, nfetch);
         if (ku != 1)  /* always try ku == 1 for general case */
         {
            sprintf(fnam, "res/%cKB_0_1", pre);
            mf = mmcase(fnam,pre, "JIK", 'T', 'N', nb, nb, kb, 0, 0, 0, 0, 0, 0,
                        mu, nu, 1, muladd, genlat, beta, 1, 1, csC,
                        FFetch, ifetch, nfetch);
            if (mf > genmf) { genku = 1; genmf = mf; }
         }
         if (1.01 * genmf > specmf) break;
         fprintf(fp, 
            "%3d  %6d  %3d %3d %3d %3d %3d  %5d  %5d  %5d  %9.2lf  %9.2lf\n",
                 kb, muladd, speclat, nb, mu, nu, ku, FFetch, ifetch, nfetch, 
                 specmf, genmf);
         fflush(fp);
      }
      fprintf(fp, 
              "%3d  %6d  %3d %3d %3d %3d %3d  %5d  %5d  %5d  %9.2lf  %9.2lf\n",
              0, muladd, genlat, nb, mu, nu, genku, FFetch, ifetch, nfetch, 
              specmf, genmf);
      fclose(fp);
   }
   else fclose(fp);  /* results already exist; just release the handle */
}

void FindCleanupM(char pre, int nb, int mu, int nu, int ku, 
                  int muladd, int lat, int FFetch, int ifetch, int nfetch)
/*
 * Times one mmcase configuration for the M-cleanup case and records the
 * result in res/<pre>CleanM.  If that file can already be opened for reading,
 * nothing is timed and nothing is written.
 * Passing -1 for either ifetch or nfetch selects ifetch = mu*nu, nfetch = 1.
 * Complex precisions ('c', 'z') are timed with beta=8 and csC=2; every other
 * precision uses beta=1 and csC=1.
 * NOTE(review): the 0 handed to mmcase right after the three nb block sizes
 * presumably marks M as a runtime dimension -- confirm against mmcase.
 */
{
   const int cplx = (pre == 'c' || pre == 'z');
   const int beta = cplx ? 8 : 1;
   const int csC = cplx ? 2 : 1;
   char path[256];
   double mflop;
   FILE *fp;

   if (ifetch == -1 || nfetch == -1)
   {
      ifetch = mu * nu;
      nfetch = 1;
   }

   sprintf(path, "res/%cCleanM", pre);
   fp = fopen(path, "r");
   if (fp != NULL)   /* result already on disk: keep it */
   {
      fclose(fp);
      return;
   }

   mflop = mmcase(NULL, pre, "JIK", 'T', 'N', nb, nb, nb, 0, nb, nb, nb, nb, 0,
                  mu, nu, ku, muladd, lat, beta, 1, 1, csC,
                  FFetch, ifetch, nfetch);
   fp = fopen(path, "w");
   assert(fp);
   PutInstLogFile(fp, muladd, lat, nb, mu, nu, ku,
                  FFetch, ifetch, nfetch, mflop);
   fclose(fp);
}
void FindCleanupN(char pre, int nb, int mu, int nu, int ku,
                  int muladd, int lat, int FFetch, int ifetch, int nfetch)
/*
 * Times one mmcase configuration for the N-cleanup case and records the
 * result in res/<pre>CleanN.  If that file can already be opened for reading,
 * nothing is timed and nothing is written.
 * Passing -1 for either ifetch or nfetch selects ifetch = mu*nu, nfetch = 1.
 * Complex precisions ('c', 'z') are timed with beta=8 and csC=2; every other
 * precision uses beta=1 and csC=1.
 * NOTE(review): the 0 handed to mmcase in the second slot after the three nb
 * block sizes presumably marks N as a runtime dimension -- confirm against
 * mmcase.
 */
{
   const int cplx = (pre == 'c' || pre == 'z');
   const int beta = cplx ? 8 : 1;
   const int csC = cplx ? 2 : 1;
   char path[256];
   double mflop;
   FILE *fp;

   if (ifetch == -1 || nfetch == -1)
   {
      ifetch = mu * nu;
      nfetch = 1;
   }

   sprintf(path, "res/%cCleanN", pre);
   fp = fopen(path, "r");
   if (fp != NULL)   /* result already on disk: keep it */
   {
      fclose(fp);
      return;
   }

   mflop = mmcase(NULL, pre, "JIK", 'T', 'N', nb, nb, nb, nb, 0, nb, nb, nb, 0,
                  mu, nu, ku, muladd, lat, beta, 1, 1, csC,
                  FFetch, ifetch, nfetch);
   fp = fopen(path, "w");
   assert(fp);
   PutInstLogFile(fp, muladd, lat, nb, mu, nu, ku,
                  FFetch, ifetch, nfetch, mflop);
   fclose(fp);
}

void FindCleanup(char pre, int nb, int mu, int nu, int ku, int muladd, int lat,
                 int ForceFetch, int ifetch, int nfetch)
/*
 * Runs the three cleanup searches in turn -- M, then N, then K -- handing
 * each one the same parameter set.  ForceFetch is forwarded as the FFetch
 * argument of each search.
 */
{
   FindCleanupM(pre, nb, mu, nu, ku, muladd, lat, ForceFetch, ifetch, nfetch);
   FindCleanupN(pre, nb, mu, nu, ku, muladd, lat, ForceFetch, ifetch, nfetch);
   FindCleanupK(pre, nb, mu, nu, ku, muladd, lat, ForceFetch, ifetch, nfetch);
}
int GetNumRegs0(char pre, int muladd, int nb, int lat, 
                int nr0, int nrN, int incN)
/*
 * Times mms_case at a sequence of register counts and reports where the
 * performance stops holding up.
 * The sequence starts at nr0 and covers every point not exceeding nrN,
 * stepping by incN, or doubling each time when incN == -2.  For each count
 * FindMUNU chooses mu and nu, and mms_case is timed with ku = nb.
 * The chosen sample is the last one whose rate is within about 9% of the best
 * rate seen up to that point.
 * RETURNS:
 *   the register count of the chosen sample, if every later sample is more
 *   than 10% below the best rate;
 *   -1 if some later sample is not that much slower (no clear drop-off);
 *   nrN if no sample was chosen (empty sequence, or no rate above zero).
 */
{
   int n, nr, i, imax, nu, mu;
   double *rates, mf, mmf=0.0;

/*
 * A step that does not advance would make the counting loop below spin
 * forever, so reject it up front
 */
   assert(incN > 0 || (incN == -2 && nr0 > 0));

   n = 0;
   i = nr0;
   while (i <= nrN)
   {
      if (incN == -2) i <<= 1;
      else i += incN;
      n++;
   }
   if (n == 0) return(nrN);  /* nothing to time; same answer as "none chosen" */

   rates = malloc(n * sizeof(*rates));
   assert(rates);
/*
 * imax == n means "no sample chosen"; previously imax was left uninitialized
 * when no rate passed the test below
 */
   imax = n;
   nr = nr0;
   for (i=0; i < n; i++)
   {
      FindMUNU(muladd, lat, nr, &mu, &nu);
      mf = rates[i] = mms_case(pre, muladd, nb, mu, nu, nb, lat);
      if (1.09*mf > mmf)
      {
         if (mf > mmf) mmf = mf;
         imax = i;
      }
      if (incN == -2) nr <<= 1;
      else nr += incN;
   }
   if (imax < n)
   {
/*
 *    Every sample past imax must be clearly (>10%) slower than the best one
 */
      for (i=imax+1; i < n && 1.10*rates[i] < mmf; i++);
      if (i != n) i = -1;
      else if (incN == -2) i = (nr0 << imax);
      else i = nr0 + imax*incN;
   }
   else i = nrN;
   free(rates);
   return(i);
}

int RefineNumRegs(char pre, int muladd, int nb, int lat, int nr0, int nrN)
/*
 * Repeatedly halves the gap between nr0 and nrN until it is smaller than two,
 * then returns the lower bound.  Each step asks GetNumRegs0 to compare the
 * two points nr0 and nr0 + gap/2:
 *   answer == nr0  : the upper bound is pulled down to nr0 + gap/2
 *   anything else  : the lower bound is pushed up to nr0 + gap/2
 */
{
   int half, pick;

   while ( (half = (nrN - nr0) / 2) >= 1 )
   {
      pick = GetNumRegs0(pre, muladd, nb, lat, nr0, nr0+half, half);
      if (pick == nr0) nrN = nr0 + half;  /* lower point won: look below */
      else nr0 += half;                   /* otherwise keep going larger */
   }
   return(nr0);
}

int GetNumRegs(char pre, int muladd, int nb, int lat, int maxnr)
/*
 * Estimates the number of usable registers.  A coarse pass probes 4, 8,
 * 16, ... up to maxnr via GetNumRegs0; when that pass yields a count, the
 * range [count, 2*count] is narrowed with RefineNumRegs.
 * Progress banners go to stderr.
 * RETURNS: the refined count, or -1 if the coarse pass returned -1.
 */
{
   int coarse, nregs;

   fprintf(stderr, "\n\nFINDING ROUGHLY HOW MANY REGISTERS TO USE:\n\n");

   coarse = GetNumRegs0(pre, muladd, nb, lat, 4, maxnr, -2);
   nregs = (coarse == -1) ? coarse
         : RefineNumRegs(pre, muladd, nb, lat, coarse, coarse<<1);

   fprintf(stderr, "\n\nAPPROXIMATE NUMBER OF USABLE REGISTERS=%d\n\n", nregs);
   return(nregs);
}

static void RetimeBestCase(char pre, int csC, char TA, char TB, int NB,
                           int fixMN, int fixK)
/*
 * Helper for RunTimes.  Looks for the log file
 *    res/<pre>best<TA><TB>_<M>x<N>x<K>
 * where M and N are NB when fixMN is nonzero (else 0) and K is NB when fixK
 * is nonzero (else 0).  If the file exists and its logged mflop is negative,
 * the case is re-timed with mmcase and the file rewritten through
 * PutInstLogFile1.  The blocking factor handed to mmcase is the nb read back
 * from the log file, not NB.
 */
{
   char fnam[128];
   int muladd, lat, nb, mu, nu, ku, ffetch, ifetch, nfetch;
   double mf;
   FILE *fp;

   sprintf(fnam, "res/%cbest%c%c_%dx%dx%d", pre, TA, TB,
           fixMN ? NB : 0, fixMN ? NB : 0, fixK ? NB : 0);
   fp = fopen(fnam, "r");
   if (fp == NULL) return;       /* no such case recorded */
   fclose(fp);

   GetInstLogFile(fnam, pre, &muladd, &lat, &nb, &mu, &nu, &ku,
                  &ffetch, &ifetch, &nfetch, &mf);
   if (!(mf < 0.0)) return;      /* already has a timing */

   mf = mmcase(NULL, pre, "JIK", TA, TB, nb, nb, nb,
               fixMN ? nb : 0, fixMN ? nb : 0, fixK ? nb : 0, 0, 0, 0,
               mu, nu, ku, muladd, lat, 1,
               1, 1, csC, ffetch, ifetch, nfetch);
   PutInstLogFile1(fnam, pre, muladd, lat, nb, mu, nu, ku,
                   ffetch, ifetch, nfetch, mf);
}

void RunTimes(char pre)
/*
 * Fills in missing timings in existing result files for precision pre.  A
 * logged mflop value below zero marks a case as needing to be timed.
 *   1. res/<pre>MMRES: if present and untimed, it is re-timed and rewritten.
 *      The integer following the two log lines (maxreg) is carried over to
 *      the rewritten file when it could be read.
 *   2. res/<pre>NCNB supplies NB; if that file is absent the routine returns.
 *   3. For every transpose pair in {N,T}x{N,T}, the files
 *      res/<pre>best??_NBxNBxNB, _0x0xNB and _0x0x0 are re-timed as needed.
 */
{
   const char TR[2] = {'N', 'T'};
   char fnam[128], ln[128];
   const int COMPLEX = (pre == 'c' || pre == 'z');
   int csC = (COMPLEX ? 2 : 1);
   int NB, muladd, lat, nb, mu, nu, ku, ffetch, ifetch, nfetch, ia, ib;
   int maxreg=0, have_maxreg, nread;
   double mf;
   FILE *fp;

   sprintf(fnam, "res/%cMMRES", pre);
   fp = fopen(fnam, "r");
   if (fp)
   {
/*
 *    Skip the two log lines, then pick up maxreg; remember whether it was
 *    really there so that garbage is never written back
 */
      have_maxreg = (fgets(ln, sizeof(ln), fp) != NULL &&
                     fgets(ln, sizeof(ln), fp) != NULL &&
                     fscanf(fp, " %d", &maxreg) == 1);
      fclose(fp);
      GetInstLogFile(fnam, pre, &muladd, &lat, &nb, &mu, &nu, &ku,
                     &ffetch, &ifetch, &nfetch, &mf);
      if (mf < 0.0) /* need to retime */
      {
         mf = mmcase(NULL, pre, "JIK", 'T', 'N', nb, nb, nb, nb, nb, nb, 
                     nb, nb, 0, mu, nu, ku, muladd, lat, 1, 1, 1, csC, 
                     ffetch, ifetch, nfetch);
         fp = fopen(fnam, "w");
         assert(fp);
         PutInstLogFile(fp, muladd, lat, nb, mu, nu, ku,
                        ffetch, ifetch, nfetch, mf);
         if (have_maxreg) fprintf(fp, "%d\n", maxreg);
         fclose(fp);
      }
   }

   sprintf(fnam, "res/%cNCNB", pre);
   fp = fopen(fnam, "r");
   if (fp == NULL) return;
/*
 * The read is kept outside assert() so it still happens when NDEBUG is set
 */
   nread = fscanf(fp, " %d", &NB);
   fclose(fp);
   assert(nread == 1);
   if (nread != 1) return;

   for (ia=0; ia < 2; ia++)
   {
      for (ib=0; ib < 2; ib++)
      {
         RetimeBestCase(pre, csC, TR[ia], TR[ib], NB, 1, 1); /* NBxNBxNB */
         RetimeBestCase(pre, csC, TR[ia], TR[ib], NB, 0, 1); /* 0x0xNB   */
         RetimeBestCase(pre, csC, TR[ia], TR[ib], NB, 0, 0); /* 0x0x0    */
      }
   }
}

void cmmsearch(char pre, int nreg, int KUisNB, int muladd, int nNBs, int *NBs, 
               int mu, int nu, int ku, int lat)
/*
 * With all other parameters set by real search, find good complex NB.
 * Every candidate in NBs[0..nNBs-1] is timed with mms_case; the K unrolling
 * used for a candidate is the candidate itself when KUisNB is nonzero,
 * otherwise min(ku, candidate).  The fastest candidate is kept.
 * Side effects:
 *   - sets ComplexFind to 2 and stores the winning parameters in the Z*
 *     globals (Znb, Zmuladd, Zmu, Znu, Zku, Zlat, ZmfB);
 *   - writes res/<pre>BEST and res/<pre>NB, the latter holding "1" and the
 *     chosen NB on separate lines;
 *   - calls CreateSummary with the chosen parameters.
 * If no candidate reports a rate above zero, the NB recorded is 0.
 */
{
   const char *typ;
   char ln[512];
   int i, k, mnb=0;
   double mf, mmf=0.0;
   FILE *fp;

   if (pre == 'c') typ = "SCPLX";
   else typ = "DCPLX";

   ComplexFind = 2;
   for (i=0; i < nNBs; i++)
   {
      if (KUisNB) k = NBs[i];
      else k = Mmin(ku, NBs[i]);
      mf = mms_case(pre, muladd, NBs[i], mu, nu, k, lat);
      if (mf > mmf)
      {
         mmf = mf;
         mnb = NBs[i];
      }
   }
   if (KUisNB) ku = mnb;
   else ku = Mmin(ku, mnb);
   sprintf(ln, "res/%cBEST", pre);
   fp = fopen(ln, "w");
   assert(fp);  /* fail here rather than inside fprintf on a NULL stream */
   fprintf(fp, 
           "muladd=%d typ=%s pre=%c ldc=%d nb=%d mu=%d nu=%d ku=%d lat=%d\n",
           muladd, typ, pre, 0, mnb, mu, nu, ku, lat);
   fclose(fp);
   Znb = mnb; Zmuladd=muladd;  Zmu=mu; Znu=nu; Zku=ku; Zlat=lat; ZmfB=mmf;

/*
 * Save NB we've found 
 */
   sprintf(ln, "res/%cNB", pre);
   fp = fopen(ln, "w");
   assert(fp);
   fprintf(fp, "%d\n%d\n", 1, Znb);
   fclose(fp);
   CreateSummary(pre, nreg, muladd, lat, mnb, mu, nu, ku, 0, Zmu*Znu, 1, ZmfB);
}
void GetMulAdd(char pre, int *MULADD, int *lat)
/*
 * Reads two integers from res/<pre>MULADD, storing the first in *MULADD and
 * the second in *lat.  If the file cannot be opened, it is first generated by
 * running "make RunMulAdd pre=<pre> maxlat=6 mflop=200" through system().
 * Aborts via assert if the make command fails, the file still cannot be
 * opened, or either integer cannot be parsed.
 */
{
   char nam[64], ln[128];
   int ierr, gotM, gotL;
   FILE *fp;

   sprintf(nam, "res/%cMULADD", pre);
   fp = fopen(nam, "r");
   if (fp == NULL)
   {
      sprintf(ln, "make RunMulAdd pre=%c maxlat=%d mflop=%d\n", pre, 6, 200);
/*
 *    Run the command outside assert(): with NDEBUG defined an assert's
 *    argument is never evaluated, which would silently skip the make
 */
      ierr = system(ln);
      assert(ierr == 0);
      fp = fopen(nam, "r");
      assert(fp != NULL);
   }
   gotM = (fscanf(fp, "%d", MULADD) == 1);
   gotL = (fscanf(fp, "%d", lat) == 1);
   fclose(fp);
   assert(gotM && gotL);  /* do not hand back unset outputs unnoticed */
}

int main(int nargs, char *args[])
/*
 * Search driver.  Settings come from GetSettings.
 *   - ROUT == -2: only RunTimes is run, then the program exits.
 *   - Otherwise the multiply/add setting and latency are obtained via
 *     GetMulAdd, the candidate NB list is read from res/<prec>NB (created
 *     with findNBs if missing), and, when nreg is -1, the register count is
 *     probed with GetNumRegs and clamped into the range handled below.
 *   - Complex precisions ('c', 'z') first run the real search with the
 *     matching real precision and then cmmsearch; real precisions run
 *     mmsearch only.  Both paths finish with FindNoCopy and FindCleanup and
 *     print the best parameters to stdout.
 */
{
   char prec, upre, lang;
   char NBnam[80];
   int MULADD, lat, nNBs, MaxL1Size, ForceLat, FRC, i, nreg, ku, ROUT;
   int *NBs;
   FILE *NBf;

   GetSettings(nargs, args, &prec, &lang, &ku, &ForceLat, &FRC, &nreg, 
               &MaxL1Size, &ROUT);
   LANG = lang;

   if (prec == 'z') upre = 'd';
   else if (prec == 'c') upre = 's';
   else upre = prec;
   if (ROUT == -2)
   {
      RunTimes(prec);
      exit(0);
   }
   fprintf(stderr, "Precision='%c', FORCE=%d, LAT=%d, nreg=%d, MaxL1=%d\n",
           prec, FRC, ForceLat, nreg, MaxL1Size);
   sprintf(NBnam, "res/%cNB", prec);
   GetMulAdd(upre, &MULADD, &lat);
   NBf = fopen(NBnam, "r");
   if (!NBf)
   {
      findNBs(upre, NBnam, MaxL1Size);
/*
 *    Open outside assert() so the file is still opened when NDEBUG is set
 */
      NBf = fopen(NBnam, "r");
      assert(NBf != NULL);
   }
   if (fscanf(NBf, "%d", &nNBs) != 1 || nNBs < 1)
   {
      fprintf(stderr, "Unable to read a positive NB count from %s\n", NBnam);
      exit(EXIT_FAILURE);
   }
   fprintf(stderr, "\nNB's to try: ");
   NBs = malloc(nNBs*sizeof(*NBs));
   assert(NBs);
   for (i=0; i != nNBs; i++)
   {
      if (fscanf(NBf, "%d", &NBs[i]) != 1)
      {
         fprintf(stderr, "\nUnable to read NB number %d from %s\n", i, NBnam);
         exit(EXIT_FAILURE);
      }
      fprintf(stderr, "%d   ",NBs[i]);
   }
   fclose(NBf);
   fprintf(stderr, "\n\n");

   if (nreg == -1)
   {
      nreg = GetNumRegs(upre, MULADD, NBs[0], lat, 64);
      if (nreg == -1)
      {
         fprintf(stderr, 
            "\nUNABLE TO FIND NUMBER OF REGISTERS, ASSUMMING 32.\n\n");
         nreg = 32;
      }
      if (nreg > 48)
      {
         fprintf(stderr, "FOUND NUMBER OF REGISTERS TO BE %d; THIS WOULD TAKE TOO LONG TO SEARCH, SO SETTING TO 48.\n", nreg);
         nreg = 48;
      }
      if (nreg < 8)
      {
         fprintf(stderr, 
                 "FOUND # OF REGISTERS TO BE %d; TRYING 8 FOR SAFETY.\n",
                 nreg);
         nreg = 8;
      }
      else if (nreg < 16)
      {
         fprintf(stderr, 
                 "FOUND # OF REGISTERS TO BE %d; TRYING 16 FOR SAFETY.\n",
                 nreg);
         nreg = 16;
      }
   }
   if (ForceLat != -1) lat = ForceLat;
   if (prec == 'c' || prec == 'z')
   {
/*
 *    Real search first (ComplexFind = 1), then tune NB for complex
 */
      ComplexFind = 1;
      mmsearch(upre, MULADD, ku, nNBs, NBs, nreg, lat, FRC);
      cmmsearch(prec, nreg, (Znb <= Zku), Zmuladd, nNBs, NBs, Zmu, Znu, 
                Zku, Zlat);
      FindNoCopy(prec, Znb, Zmu, Znu, Zku, Zmuladd, Zlat,
                 ZFFetch, Zifetch, Znfetch);
      FindCleanup(prec, Znb, Zmu, Znu, Zku, Zmuladd, Zlat, 
                  ZFFetch, Zifetch, Znfetch);
      fprintf(stdout, "\n\nFor this run, the best parameters found were MULADD=%d, NB=%d, MU=%d, NU=%d, KU=%d\n",
              Zmuladd, Znb, Zmu, Znu, Zku);
      fprintf(stdout, 
              "latency factor=%d.  This gave a performance = %f MFLOP.\n",
              Zlat, ZmfB);
   }
   else
   {
      mmsearch(prec, MULADD, ku, nNBs, NBs, nreg, lat, FRC);
      FindNoCopy(prec, RnbB, RmuB, RnuB, RkuB, RmuladdB, RlatB,
                 RFFetch, Rifetch, Rnfetch);
      FindCleanup(prec, RnbB, RmuB, RnuB, RkuB, RmuladdB, RlatB,
                  RFFetch, Rifetch, Rnfetch);
      fprintf(stdout, "\n\nFor this run, the best parameters found were MULADD=%d, NB=%d, MU=%d, NU=%d, KU=%d\n",
              RmuladdB, RnbB, RmuB, RnuB, RkuB);
      fprintf(stdout, 
              "latency factor=%d.  This gave a performance = %f MFLOP.\n",
              RlatB, RmfB);
   }
   free(NBs);
   fprintf(stdout, 
"The necessary files have been created.  If you are happy with\n");
   fprintf(stdout, 
"the above mflops for your system, type 'make %cinstall'.\n", prec);
   fprintf(stdout, 
"Otherwise, try the xmmsearch with different parameters, or hand\n");
   fprintf(stdout, "tweak the code.\n");
   exit(0);
}
