/*

######################################################################
######################################################################
##                                                                  ##
##                                                                  ##
##                          dynet                                   ##
##                                                                  ##
##                a recurrent neural network                        ##
##              for modelling dynamical systems                     ##
##                                                                  ##
##                           by                                     ##
##                                                                  ##
##                    Coryn Bailer-Jones                            ##
##                                                                  ##
##                         22/05/98                                 ##
##                                                                  ##
##             email: calj@mpia-hd.mpg.de                           ##
##               www: http://wol.ra.phy.cam.ac.uk/calj/             ##
##                                                                  ##
##                                                                  ##
##        see the README file for disclaimer and warranty           ##
##        see the dynet_manual file for operational details         ##
##                                                                  ##
##        This file is copyright 1998 by C.A.L. Bailer-Jones        ##
##                                                                  ##
##                                                                  ##
######################################################################
######################################################################


FILE:		dynet.c
DESCRIPTION:    ANSI C dynet program
AUTHOR:		Coryn Bailer-Jones
LAST MOD DATE:	02.03.99


######################################################################
######################################################################
##                                                                  ##
##                           dynet.c                                ##
##                                                                  ##
######################################################################
######################################################################


  Flags: (X = not yet implemented)

  1-49 inc. are user defined flags
  51+       are program internal flags 

  ********* User defined flags **********
  
  flag[1]  (train?) 0 = no, 1 = yes
  flag[2]  (apply?) 0 = no, 1 = yes
  flag[3]  (optimization method) 1 = gradient descent
                                 2 = macopt (forces flag[4]=4)
  flag[4]  (update method) 1 = after each p,t combination
                           3 = after each p (all t)
			   4 = after all p and t
X flag[5]  (error function) 1 = sum of squares			   
X flag[6]  (data file format) see README
  flag[7]  (Use noise (beta) parameters?) 0 = no, 1 = yes
  flag[8]  (Use alpha (weight decay) parameters?) 0 = no
                                                  1 = yes and set to defaults
						  2 = yes and read in from file
  flag[9]  (nature of weight initialisation) 1 = uniform distribution
  flag[10] (read in weights?) 0 = no  (=> initialise weights)
                              1 = yes (set if there's a line in spec file
			               giving input weight file name)
  flag[11] (output weights file specified?) 0 = no
                   (-> give warning and use default weights file) 
                                            1 = yes
                   (-> set if there's a line in spec file giving
	               output weight file name)
  flag[12] (data scaling method) 0 = none
                                 1 = zero mean, unit standard deviation
				 2 = maxmin (not yet implemented)
				 3 = only use Hlam (i.e. partial scaling by
				     number of external and recurrent inputs)
  flag[13] (perform maccheckgrad?) 0 = no, 1 = yes
  flag[14] (verbosity level)       0 = nothing, except intro title and errors
                                       (warnings are suppressed)
				       (If flag[13]=1, output is produced
				        from maccheckgrad regardless)
				   1 = minimum
				   2 = normal 
				   3 = routine calls & diagnostic information
				   4 = everything
				   (NB lots of run specific screen dumps
				       currently written in at this level)
  flag[15] (write final values of predicted
            sequences to file for plotting?) 0 = no, 1 = yes
  flag[16] (write errors at each iteration to a file?) 0 = no, 1 = yes
  flag[17] (include initial values of state variables in
            plot file? Will only take effect if flag[15]=1) 0 = no, 1 = yes
  flag[18] (write tper files?) 0 = no, 1 = yes
	    
  ********* Internal program flags **********
				 
  flag[51] (ederiv initialisation) 0 = don't do anything
                                   1 = allocate statics
				   2 = initialize statics
				   3 = deallocate statics
  flag[52] (has network just been trained?)
                0 = no  
		1 = yes
  flag[54] (filename suffix to append - see evtpnewname() )
                1 = output file (OTFILESUFFIX)
		2 = error file  (ERFILESUFFIX)
  flag[55] (end-of-training call to dymacint()? ) 0 = no, 1 = yes

*/

#include <stdio.h>   
#include <stdlib.h>   
#include <math.h>
#include <string.h>
#include "dynet.h"


/* =================== Start of Main Program ================== */

void main(int numinvar, char *invar[])
{
  /* declarations */

  int flag[Nflags] = {0};
  
  int p;
  int Vsize; /* number of recurrent inputs */
  int Vm;    /* number of measured recurrent inputs */
  int Xsize; /* number of external inputs (does not include bias) */
  int Hsize; /* number of hidden nodes (does not include bias) */
  int Ysize; /* number of outputs */
  int TRN_Npats; /* number of patterns in training set */
  int APP_Npats; /* number of patterns in application set */
  int Nits;      /* number of training iterations (if doing graddescent) */
  long ranseed; /* or do we want to use a pointer instead? */
  
  int    macitmax;    /* maximum number of iterations to use in macopt */
  double macconvtol;  /* convergence tolerance in macopt */
  double macchecktol; /* gradient checking tolerance in maccheckgrad */
  
  double eta;   /* step size in gradient descent */
  double wtrng; /* range of initial weights */ 
  
  double **wtVH; /* weights for VH layer */
  double **wtXH; /* weights for XH layer */
  double **wtHY; /* weights for HY layer */

  double *vscale[2], *xscale[2];
  /* scale parameters for v and x respectively
     if flag[12]=1: scale[0] contains mean
		    scale[1] contains standard deviations */
  double Hlam; /* lambda parameter in tanh transfer function
	       It is passed to/from dynettrain and dynetapply as a
	       pointer as dynetapply will use weights and scale
	       factors from dynettrain if we do the training */
  
  char *TRN_tpfname[MAXTPFILES]; /* training pattern file names */
  char *APP_tpfname[MAXTPFILES]; /* applying pattern file names */
  char sfname[FNAMESIZE];        /* specfile name */
  char inwtfname[FNAMESIZE];     /* input weight file name */
  char otwtfname[FNAMESIZE];     /* output weight file name */
  char plotfname[FNAMESIZE];     /* plotting data file name */
  char errfname[FNAMESIZE];      /* error file name */

  /* Anfang */
  
  printf("\n*** dynet version %0.2f created on %s ***\n\n", VERSION, LASTMODDATE);

  if(numinvar < 2)
    neterr("main(): must specify name of specfile on command line");
  if(strcmp(invar[1],"-v")==0)
    exit(0);

  printf("Program started: ");
  fflush(stdout);
  system("date"); /* UNIX specific */
  fflush(stdout);
  
  /* obtain and check specifications (set defaults and read specfile) */

  strcpy(sfname,invar[1]);
  specread(sfname, flag, &Vsize, &Vm, &Xsize, &Hsize, &Ysize, 
	   &Nits, &eta, &TRN_Npats, TRN_tpfname, &APP_Npats, APP_tpfname,
	   inwtfname, otwtfname, &ranseed, &macconvtol,
	   &macitmax, &macchecktol, &wtrng, plotfname, errfname);
    
  /* allocate memory */

  wtVH = Dmatrix(Vsize, Hsize); 
  wtXH = Dmatrix(Xsize+1, Hsize);
  wtHY = Dmatrix(Hsize+1, Ysize);

  vscale[0] = Dvector(Vsize);
  vscale[1] = Dvector(Vsize);
  xscale[0] = Dvector(Xsize);
  xscale[1] = Dvector(Xsize);
		     
  /* the business */

  flag[52]=0;
  if(flag[1]==1) {
    dynettrain(flag, Vsize, Vm, Xsize, Hsize, Ysize, TRN_Npats,
	       wtVH, wtXH, wtHY, vscale, xscale, &Hlam, TRN_tpfname,
	       Nits, eta, sfname, inwtfname, otwtfname, ranseed,
	       macconvtol, macitmax, macchecktol, wtrng, errfname);
    flag[52]=1;
  }
  if(flag[2]==1)
    dynetapply(flag, Vsize, Vm, Xsize, Hsize, Ysize, APP_Npats,
	       wtVH, wtXH, wtHY, vscale, xscale, &Hlam,
	       APP_tpfname, sfname, inwtfname, otwtfname, ranseed,
	       wtrng, plotfname); 

  /* deallocations */

  for(p=1;p<=TRN_Npats;++p)
    free_Cvector(TRN_tpfname[p],0);  /* allocated in specread */
  for(p=1;p<=APP_Npats;++p)
    free_Cvector(APP_tpfname[p],0);  /* allocated in specread */

  free_Dmatrix(wtVH, Vsize, Hsize);
  free_Dmatrix(wtXH, Xsize+1, Hsize);
  free_Dmatrix(wtHY, Hsize+1, Ysize);

  free_Dvector(vscale[0], Vsize);
  free_Dvector(vscale[1], Vsize);
  free_Dvector(xscale[0], Xsize);
  free_Dvector(xscale[1], Xsize);
    
  if(flag[14]>=1) printf("\n*** dynet completed ***\n\n");
  exit(0);
  
}

/* ==================== End of Main Program =================== */



/* ==================== Principal Control Routines =================== */  

void dynettrain(int *flag, int Vsize, int Vm, int Xsize, int Hsize, int Ysize,
		int Npats, double **wtVH, double **wtXH, double **wtHY,
		double **vscale, double **xscale, double *Hlam,
		char *tpfname[], int Nits, double eta,
		char *sfname, char *inwtfname, char *otwtfname, long ranseed,
		double macconvtol, int macitmax, double macchecktol,
		double wtrng, char *errfname)
{
  /* train dynet */


  int p;

  double ***x;     /* input data, x[p][t][l] (pattern,epoch,node) */
  targets ***tar; /* defined in dynet.h: [p][t][k] (pattern,epoch,node) */
  
  double *alpha; /* Bayes' weight decay parameters */
  double *beta;  /* noise levels */
  
  double **tsteps; /* time steps for each pattern */
  int *ntsteps;   /* number of time steps for each pattern */


  /* allocate memory */

  x   = Dpmatrix3_3(Npats+1); /* only allocates top dimension */
  tar = tpmatrix3_3(Npats+1); /* dataread() does the rest */

  alpha = Dvector(Nalpha);
  beta = Dvector(Vsize);
  
  tsteps = Dpvector(Npats+1); 
  ntsteps = Ivector(Npats+1);
  /* tsteps is a ragged 2D array
     size of tsteps[p] for each p is allocated in dataread()
     vector tsteps[0] is not used
     vector tsteps[p] (p=/=0) is of size number_of_time_steps_in_pattern_p
     tsteps[p][t] (p =/=0) holds time between epoch t-1 and t for pattern p
     tsteps[p][0] (p =/=0) is read in but not used as this length of time
                           has no meaning.
     
     ntsteps[p] (p =/=0) holds number_of_time_steps_in_pattern_p
     */

  
  /* read in data */

  dataread(Vsize, Vm, Xsize, tpfname, Npats, tsteps, ntsteps, x, tar, flag);
  
  /* initialise network */

  dynetinit(Vsize, Vm, Xsize, Hsize, Ysize, Npats, wtVH, wtXH, wtHY, vscale,
	    xscale, Hlam, flag, inwtfname, sfname, ranseed, tsteps,
	    alpha, beta, wtrng);  

  /* flag[53]=1; if scaling, scale both x and tar */

  if(flag[10]==0 && flag[12]!=0)
    scalecalc(Vsize, Xsize, Npats, ntsteps, x, tar,
	      xscale, vscale, Hlam, flag);

  if(flag[12]==1 || flag[12]==2)
    datascale(Vsize, Xsize, Npats, ntsteps, x, tar, xscale, vscale, flag);
  
  /* train */

  flag[51]=1; /* will allocate and initialise statics in ederiv */
  if(flag[3]==1) { /* gradient descent */
    graddescent(Vsize, Xsize, Hsize, Ysize, wtVH, wtXH, wtHY, x, tar, 
		tsteps, ntsteps, Npats, alpha, beta, *Hlam, flag, Nits, eta);
  }
  else if(flag[3]==2) { /* macopt */
    callmacopt(Vsize, Xsize, Hsize, Ysize, wtVH, wtXH, wtHY, x,
	       tar, tsteps, ntsteps, Npats, alpha, beta, *Hlam, flag,
	       errfname, macconvtol, macitmax, macchecktol);
  }
  else neterr("dynettrain(): flag[3] option does not exist");
  
  /* write out weights */

  if(flag[11]==1)
    writeweights(Vsize, Vm, Xsize, Hsize, Ysize, wtVH, wtXH, wtHY,
		 vscale, xscale, *Hlam, otwtfname, flag);
  
  /* deallocate memory */

  for(p=1;p<=Npats;++p) { /* x[p] and tar allocated in dataread() */
    free_Dmatrix(x[p],ntsteps[p],Xsize+1);
    free_tmatrix(tar[p],ntsteps[p],Vsize);
  }
  free_Dpmatrix3_3(x,Npats+1);
  free_tpmatrix3_3(tar,Npats+1);

  free_Dvector(alpha,Nalpha);
  free_Dvector(beta,Vsize);
  
  for(p=1;p<=Npats;++p)
    free_Dvector(tsteps[p], ntsteps[p]);
  /* free_Dpvector(tsteps, Npats+1); - see notes under version 1.06 */

  free_Ivector(ntsteps, Npats+1);
  
}



void dynetapply(int *flag, int Vsize, int Vm, int Xsize, int Hsize, int Ysize,
		int Npats, double **wtVH, double **wtXH, double **wtHY,
		double **vscale, double **xscale, double *Hlam,
		char **APP_tpfname, char *sfname,
		char *inwtfname, char *otwtfname, long ranseed,
		double wtrng, char *plotfname)
{
  /* Apply network */

  int p,t,k;
  /* int l,m,n; */
  
  targets ***tar; /* defined in dynet.h: [p][t][k] (pattern,epoch,node) */
  
  double ***x;   /* external inputs, x[p][t][l] (pattern,epoch,node) */
  double *v_prev;   
  double *h_prev;   
  double *y_prev;   
  double *v;   
  double *thistar, *inittar;
  double *alpha; /* alpha and beta not needed in this routine but */
  double *beta;  /* required for compatibility with dynetinit */ 
  
  double **tsteps; /* time steps for each pattern */
  double diff;    /* these 3 are error measures */

  int *ntsteps;   /* number of time steps for each pattern */
 
  FILE *tpotf;
  FILE *tperf;
  FILE *plotf;
  char tpotname[FNAMESIZE];
  char tpername[FNAMESIZE];
  char wtfname[FNAMESIZE]; /* weights file name to write to tpot file,
			    i.e. the weights used in this net application */
  char message[MESSAGESIZE];


  if(flag[14]>=2) printf("dynetapply(): Entering network application phase\n");
  
  /* allocate memory */

  x   = Dpmatrix3_3(Npats+1); /* only allocates the top dimension */
  tar = tpmatrix3_3(Npats+1); /* dataread() does the rest */
  v_prev = Dvector(Vsize); 
  h_prev = Dvector(Hsize+1); 
  y_prev = Dvector(Ysize); 
  v = Dvector(Vsize); 
  thistar = Dvector(Vsize); 
  inittar = Dvector(Vsize); /* acutally only use this if flag[17]==1 */
  
  alpha = Dvector(Nalpha); 
  beta  = Dvector(Vsize);  
  
  tsteps = Dpvector(Npats+1); 
  ntsteps = Ivector(Npats+1);
  /* tsteps is a ragged 2D array
     size of tsteps[p] for each p is allocated in dataread()
     vector tsteps[0] is not used
     vector tsteps[p] (p=/=0) is of size number_of_time_steps_in_pattern_p
     tsteps[p][t] (p =/=0) holds time between epoch t-1 and t for pattern p
     tsteps[p][0] (p =/=0) is read in but not used as this length of time
                           has no meaning.

     ntsteps[p] (p =/=0) holds number_of_time_steps_in_pattern_p
  */

  h_prev[Hsize]=HBIAS; /* initialises hidden layer bias node */
  
  /* read in data */

  dataread(Vsize, Vm, Xsize, APP_tpfname, Npats, tsteps, ntsteps,
	   x, tar, flag);
      
  /* initialise network */

  /*  flag[53]=0; */

  /* i.e. no need to scale tar when not training.  Note that if we are
     applying a network with random weights, then tar is never scaled
     so scale values of 0.0 will be written to the output file. We
     could scale them, but it would be meaningless: no dataset has
     been used to obtain the weights so what dataset do we use to
     obtain the scale? This would change if we ever do teacher forcing
     in the application. */
  
  if(flag[52]==0) { /* => we don't have weights or scale factors,
		       so we'll either read weights in or use random ones */
    dynetinit(Vsize, Vm, Xsize, Hsize, Ysize, Npats, wtVH, wtXH, wtHY, vscale,
	      xscale, Hlam, flag, inwtfname, sfname, ranseed, tsteps,
	      alpha, beta, wtrng);  
    if(flag[10]==0 && flag[12]!=0) /* => are using random weights and
				         want to scale data */
      scalecalc(Vsize, Xsize, Npats, ntsteps, x, tar,
		xscale, vscale, Hlam, flag);
    if(flag[10]==1) /* => have read in weights */
      strcpy(wtfname,inwtfname); 
    else /* => are using random weights */
      strcpy(wtfname,otwtfname); 
  }
  else /* => are using weights and scale factors found from training */
    strcpy(wtfname,otwtfname); 
  
  if(flag[12]==1 || flag[12]==2)
      datascale(Vsize, Xsize, Npats, ntsteps, x, tar, xscale, vscale, flag);

  if(flag[15]==1) {
    plotf=fopen(plotfname,"w");
    if(!plotf) {
      sprintf(message, "dynetapply(): output file %s could not be opened", plotfname);
      neterr(message);
    }
  }
  
  /* run network for all patterns and epochs */

  if(flag[14]>=2)
    printf("dynetapply(): Applying Network\n");
  
  for(p=1;p<=Npats;++p) { /* patterns */

    /* write to tper file if requested in specfile */

    if(flag[18]==1) {
      flag[54]=2;
      evtpnewname(tpername,APP_tpfname[p],flag); /* evaluate tpername */
      tperf=fopen(tpername,"w");
      if(!tperf) {
	sprintf(message, "dynetapply(): output file %s could not be opened", tpername);
	neterr(message);
      }
      fprintf(tperf, "# dynet temporal pattern error file\n");
      fprintf(tperf, "# ##################################\n");
      fprintf(tperf, "# input   file = %s\n", APP_tpfname[p]);
      fprintf(tperf, "# weights file = %s\n", wtfname);
      fprintf(tperf, "# V (tot state), Vm (meas state), epochs:\n");
      fprintf(tperf, "  %d              %d                %d\n",
	      Vsize, Vm, ntsteps[p]);
      fprintf(tperf, "# Measured (state/target/state-target/|diff/target|):\n"); 
    }

    flag[54]=1;
    evtpnewname(tpotname,APP_tpfname[p],flag); /* evaluate tpotname */
    tpotf=fopen(tpotname,"w");
    if(!tpotf) {
      sprintf(message, "dynetapply(): output file %s could not be opened", tpotname);
      neterr(message);
    }
    if(flag[14]>=2)
      printf("%3d: %s -> %s\n", p, APP_tpfname[p], tpotname);
    
    fprintf(tpotf, "# dynet temporal pattern output file\n");
    fprintf(tpotf, "# ##################################\n");
    fprintf(tpotf, "# input   file = %s\n", APP_tpfname[p]);
    fprintf(tpotf, "# weights file = %s\n", wtfname);
    fprintf(tpotf, "# V (tot state), Vm (meas state), epochs:\n");
    fprintf(tpotf, "  %d              %d                %d\n",
	    Vsize, Vm, ntsteps[p]);
    fprintf(tpotf, "# State variables (epoch/measured/unmeasured):\n"); 

    /* set and write initial condition */
    
    t=0;
    dynetloopinit(Vsize, v_prev, tar[p][t]); 
    for(k=0;k<Vsize;++k)
      thistar[k] = v_prev[k];
    if(flag[12]==1 || flag[12]==2)
      unscale(Vsize, thistar, vscale, flag);
    /* Note that we also unscale any initial values which were not defined,
       and hence were set to VINITDEF. To reiterate, any unspecified values
       of v at t=0 are set to VINITDEF within the network, but these will
       generally correspond to different values if scaling has been used, and
       these are the values written to any output files.
    */
    if(flag[17]==1) {
      for(k=0;k<Vsize;++k)
	inittar[k] = thistar[k]; 
    }
    fprintf(tpotf, "%3d ", t);
    for(k=0;k<Vsize;++k) 
      fprintf(tpotf, " % 8.5f ", thistar[k]);
    fprintf(tpotf, "\n");

    if(flag[18]==1) {
      fprintf(tperf, "%3d ", t);
      for(k=0;k<Vm;++k) {
	fprintf(tperf, " % 8.5f ", thistar[k]);
	if(tar[p][t][k].def)
	  fprintf(tperf, "% 8.5f ", thistar[k]);
	else
	  fprintf(tperf, " ------- ");
	fprintf(tperf, " 0.00    0.00 ");
      }
      fprintf(tperf, "\n");    
    }
    
    /* evaluate sequence */

    for(t=1;t<ntsteps[p];++t) { /* epochs */
      /* for(l=1;l<Xsize;++l)
	 printf("x[%d][%d][%d] = %lf\n", p, t, l, x[p][t-1][l]); */
      dynetloop(Vsize, Xsize, Hsize, Ysize,
		v_prev, x[p][t-1], h_prev, y_prev, v,
		tsteps[p][t], wtVH, wtXH, wtHY, *Hlam);
      if(flag[14]>=3) {
	for(k=0;k<Vsize;++k)
	  printf("p=%2d t=%2d k=%2d  v_prev=% 8.7f y=% 8.7f v=% 8.7f  %2d tar=% 8.7f\n", p, t, k, v_prev[k], y_prev[k], v[k], tar[p][t][k].def, tar[p][t][k].val);
      }
      updprev(Vsize, v_prev, v); /* copies v into v_prev */
      fprintf(tpotf, "%3d ", t);
      for(k=0;k<Vsize;++k) 
	thistar[k] = tar[p][t][k].val;
      if(flag[12]==1 || flag[12]==2) {
	unscale(Vsize, v, vscale, flag);
	unscale(Vsize, thistar, vscale, flag);
      }
      for(k=0;k<Vsize;++k)
	fprintf(tpotf, " % 8.5f ", v[k]);
      fprintf(tpotf, "\n");

      if(flag[18]==1) {
	fprintf(tperf, "%3d ", t);
	for(k=0;k<Vm;++k) {
	  fprintf(tperf, " % 8.5f ", v[k]);
	  if(tar[p][t][k].def)
	    fprintf(tperf, "% 8.5f ", thistar[k]);
	  else
	    fprintf(tperf, " ------- ");
	  diff = v[k]-thistar[k];
	  fprintf(tperf, "% 8.5f ", diff);
	  if(!tar[p][t][k].def) /* undefined target */
	    fprintf(tperf, "---- ");
	  else if(thistar[k]==0 && diff==0) /* allow for 0/0 */
	    fprintf(tperf, "0.00");
	  else if(thistar[k]==0) /* defined target =0 (i.e. divide by zero) */
	    fprintf(tperf, "Div0");
	  else
	    fprintf(tperf, "%4.2f ", fabs(diff/thistar[k]));
	}
	fprintf(tperf, "\n");
      }

    } /* end of epoch */
    
    fclose(tpotf);
    if(flag[18]==1)
      fclose(tperf);

    /* write to plot file if requested in specfile */
    
    if(flag[15]==1) {
      fprintf(plotf, "%3d ", p);
      for(k=0;k<Vm;++k) {
	if(!tar[p][ntsteps[p]-1][k].def) /* undefined target */
	  fprintf(plotf, "% 8.5f --------  ", v[k]);
	else 
	  fprintf(plotf, "% 8.5f % 8.5f  ", v[k], thistar[k]);
      }
      if(flag[17]==1) { /* also print initial v values - it would have
			   been neater to write out all data for each v
			   in a single block, but it's done this way for
			   backward compatibility with other versions, and
			   for the fact that we may often not want v(t=0) */
	for(k=0;k<Vm;++k)
	    fprintf(plotf, "% 8.5f ", inittar[k]);
      }
      fprintf(plotf, "\n");
    }
    
  } /* end of pattern */

  if(flag[15]==1)
    fclose(plotf);
  
  /* write out weights if we've been using random weights */

  if(flag[11]==1 && flag[52]==0 && flag[10]==0)
    writeweights(Vsize, Vm, Xsize, Hsize, Ysize, wtVH, wtXH, wtHY,
		 vscale, xscale, *Hlam, otwtfname, flag);

  /* deallocate memory */

  for(p=1;p<=Npats;++p) { /* x[p] and tar allocated in dataread() */
    free_Dmatrix(x[p],ntsteps[p],Xsize+1);
    free_tmatrix(tar[p],ntsteps[p],Ysize);
  }
  free_Dpmatrix3_3(x, Npats+1);
  free_tpmatrix3_3(tar, Npats+1);

  free_Dvector(v_prev, Vsize);
  free_Dvector(h_prev, Hsize+1);
  free_Dvector(y_prev, Ysize);
  free_Dvector(v, Vsize);
  free_Dvector(thistar, Vsize);
  free_Dvector(inittar, Vsize);

  free_Dvector(alpha, Nalpha);
  free_Dvector(beta, Vsize);
 
  for(p=1;p<=Npats;++p) 
    free_Dvector(tsteps[p], ntsteps[p]);
  /* free_Dpvector(tsteps, Npats+1); - see notes under version 1.06 */

  free_Ivector(ntsteps, Npats+1);
}



/* ==================== Forward Pass Routine =================== */

void dynetloop(int Vsize, int Xsize, int Hsize, int Ysize,
	       double *v_prev, 
	       double *x_prev, double *h_prev, double *y_prev, double *v,
	       double tstep,
	       double **wtVH, double **wtXH, double **wtHY, double Hlam)
{
  /* Loop one epoch of a pattern through the network.
     Given v(t-1) and x(t-1) we evaluate h(t-1), y(t-1) and v(t).
     x(t-1) is passed in with variable x_prev.
     v(t-1) is passed in with variable v_prev.
     h(t-1) is passed out with variable h_prev
     y(t-1) is passed out with variable y_prev.
     v(t) is passed out with variable v. */
     
  int k,m,n;

  double *p; /* h = f(p) */
  double *tmp;
  double *q; /* y = g(q) */

  p = Dvector(Hsize);
  tmp = Dvector(Hsize);
  q = Dvector(Ysize);
  
  /* forward pass */
  
  vectmat(wtVH, v_prev, Vsize, Hsize, tmp);
  vectmat(wtXH, x_prev, Xsize+1, Hsize, p);
  for(m=0;m<Hsize;++m) {
    p[m] += tmp[m];
    h_prev[m] = Tf(p[m], Hlam);
  }
  
  vectmat(wtHY, h_prev, Hsize+1, Ysize, q);
    for(n=0;n<Ysize;++n) /* NB for linear outputs Tg(q[n]) = q[n] */
      y_prev[n] = Tg(q[n]);

  /* recurrent loop */
  
  for(k=0;k<Vsize;++k) 
    v[k] = v_prev[k] + tstep*y_prev[k];

  free_Dvector(p, Hsize);
  free_Dvector(tmp, Hsize);
  free_Dvector(q, Ysize);

  
}



/* ==================== Gradient Descent Routines =================== */

void graddescent(int Vsize, int Xsize, int Hsize, int Ysize,
		 double **wtVH, double **wtXH, double **wtHY, double ***x,
		 targets ***tar, double **tsteps, int *ntsteps, int Npats,
		 double *alpha, double *beta, double Hlam, int *flag, int Nits,
		 double eta)
{
  /* optimise weights by gradient descent for finite number of iterations */

  double *v_prev; /* recurrent layer */
  double *h_prev; /* hidden layer */
  double *y_prev; /* output layer */
  double *v;
  
  double **edVH; /* error derivatives wrt weights */
  double **edXH;
  double **edHY;
  double **cumedVH; /* cumulative of ed */
  double **cumedXH;
  double **cumedHY;

  int n,p,t,k;

  /* allocate memory */
  
  edVH    = Dmatrix(Vsize, Hsize); /* dE/dw_ij */
  edXH    = Dmatrix(Xsize+1, Hsize);
  edHY    = Dmatrix(Hsize+1, Ysize);
  cumedVH = Dmatrix(Vsize, Hsize); /* Sum dE/dw_ij */
  cumedXH = Dmatrix(Xsize+1, Hsize);
  cumedHY = Dmatrix(Hsize+1, Ysize);
  
  v_prev = Dvector(Vsize); 
  h_prev = Dvector(Hsize+1); 
  y_prev = Dvector(Ysize); 
  v = Dvector(Vsize); 
  
  h_prev[Hsize]=HBIAS; /* initialises hidden layer bias node */
  
  /* The three update choices implemented here are to update weights after:
     1b. passing a single epoch (for each pattern)
     3.  passing a whole pattern (all epochs)
     4.  passing all patterns (all epochs)

     I cannot do update methods 1a or 2 as these require that values
     of v(t-1), dv(t-1), dy(t-1) etc. are stored for each pattern,
     which would be a serious memory gobbler.
     */

  if(flag[14]>=3) printf("graddescent(): Training network with gradient descent\n");
  
  for(n=1;n<=Nits;++n) { /* iterations */
    for(p=1;p<=Npats;++p) { /* patterns */
      /* set v_prev for first epoch */
      dynetloopinit(Vsize, v_prev, tar[p][0]);
      for(t=1;t<ntsteps[p];++t) { /* epochs */
	dynetloop(Vsize, Xsize, Hsize, Ysize,
		  v_prev, x[p][t-1], h_prev, y_prev, v,
		  tsteps[p][t], wtVH, wtXH, wtHY, Hlam);
	ederiv(Vsize, Xsize, Hsize, Ysize,
	       v_prev, x[p][t-1], h_prev, y_prev, v,
	       wtVH, wtXH, wtHY, edVH, edXH, edHY,
	       tar[p][t], alpha, beta, Hlam, tsteps[p][t], flag); 
	if(flag[4]==1) /* update method 1b - uses edVH etc. */
	  updatewt(eta, Vsize, Xsize, Hsize, Ysize,
		   wtVH, wtXH, wtHY, edVH, edXH, edHY, flag);
	else if(flag[4]==3 || flag[4]==4)
	  cumederivs(Vsize, Xsize, Hsize, Ysize,
		     edVH, edXH, edHY, cumedVH, cumedXH, cumedHY);
	else
	  neterr("dynettrain(): flag[4] option does not exist");
	if(flag[14]>=4) {
	  printf("n =%2d  p =%2d  t =%2d   edHY[0][0] = %8.5f  cumedHY[0][0] = %8.5f  wtHY[0][0] = %8.5f\n", n, p, t, edHY[0][0], cumedHY[0][0], wtHY[0][0]);
	  for(k=0;k<Vsize;++k)
	  printf("p=%2d t=%2d k=%2d  v_prev=% 8.7f y=% 8.7f v=% 8.7f  %2d tar=% 8.7f\n", p, t, k, v_prev[k], y_prev[k], v[k], tar[p][t][k].def, tar[p][t][k].val);
	}
	updprev(Vsize, v_prev, v);
      } /* end of epoch loop */
      if(flag[4]==3) /* update method 3 - uses cumedVH etc.*/
	updatewt(eta, Vsize, Xsize, Hsize, Ysize,
		 wtVH, wtXH, wtHY, cumedVH, cumedXH, cumedHY, flag);
      flag[51]=2; /* request re-initialisation of
		     derivative dynamic system for next pattern */
    } /* end of pattern loop */
    if(flag[4]==4) /* update method 4 - uses cumedVH etc.*/
      updatewt(eta, Vsize, Xsize, Hsize, Ysize,
	       wtVH, wtXH, wtHY, cumedVH, cumedXH, cumedHY, flag);	
  } /* end of iteration loop */

  /* deallocate memory */

  flag[51]=3;
  /* call ederiv to deallocate static variables - as they are statics I
     would guess that the deallocation has to be done from within ederiv */
  ederiv(Vsize, Xsize, Hsize, Ysize,
	 v_prev, x[1][0], h_prev, y_prev, v,
	 wtVH, wtXH, wtHY, edVH, edXH, edHY,
	 tar[1][0], alpha, beta, Hlam, tsteps[1][0], flag); 
  
  free_Dmatrix(edVH, Vsize, Hsize);
  free_Dmatrix(edXH, Xsize+1, Hsize);
  free_Dmatrix(edHY, Hsize+1, Ysize);
  free_Dmatrix(cumedVH, Vsize, Hsize);
  free_Dmatrix(cumedXH, Xsize+1, Hsize);
  free_Dmatrix(cumedHY, Hsize+1, Ysize);

  free_Dvector(v_prev, Vsize);
  free_Dvector(h_prev, Hsize+1);
  free_Dvector(y_prev, Ysize);
  free_Dvector(v, Vsize);
  
}



void updatewt(double eta, int Vsize, int Xsize, int Hsize, int Ysize,
	      double **wtVH, double **wtXH, double **wtHY,
     	      double **cumedVH, double **cumedXH, double **cumedHY, int *flag)
{
  /* Gradient Descent update of weights and set cumed to zero.

     if(flag[4]==1) then edVH etc. are passed and called cumedVH etc. here

     Note that we only bother to set re-initialise cumulated variables 
     if we've actually used them before, i.e. if we're using weight update
     method 3 or 4. */

  int k,l,m,n;

  for(m=0;m<Hsize;++m) {
    for(k=0;k<Vsize;++k) {
      wtVH[k][m] += -eta*cumedVH[k][m];
      if(flag[4]!=1)
	cumedVH[k][m]=0.0;
    }
    for(l=0;l<Xsize+1;++l) {
      wtXH[l][m] += -eta*cumedXH[l][m];
      if(flag[4]!=1)
	cumedXH[l][m]=0.0;
    }
  }
  for(m=0;m<Hsize+1;++m) {
    for(n=0;n<Ysize;++n) {
      wtHY[m][n] += -eta*cumedHY[m][n];
      if(flag[4]!=1)
	cumedHY[m][n]=0.0;
    }
  }  

}



/* ==================== Macopt-relevant Routines =================== */

void callmacopt(int Vsize, int Xsize, int Hsize, int Ysize,
		double **wtVH, double **wtXH, double **wtHY, double ***x,
		targets ***tar, double **tsteps, int *ntsteps,
		int Npats, double *alpha, double *beta, double Hlam,
		int *flag, char *errfname,
		double macconvtol, int macitmax, double macchecktol)
{

  /* macoptII is called, which does the entire minimization itself.

     macopt_defaults(&mca);
     - This sets default values for the macopt parameters held in
       the structure mca
       
     macoptII(wtvec, wtvec_size, dymacint, (void *)(&dga), &mca);
     - wtvec is the initial vector from which minimization commences;
       dymacint is the name of the function which macoptII must call
       in order to evaluate the gradient of the error function;
       dga is a structure of parameters which dymacint needs;
       dymacint is generally called many times by macoptII.
       Upon returning from macoptII, wtvec hold the optimized weights
       
     maccheckgrad(wtvec, wtvec_size, (double)macchecktol,
                  dymacfn, (void *)(&dga), dymacint, (void *)(&dga), 0); 
     - maccheckgrad tests to see whether the gradient of the error
       function is correct by comparing it with an approximation to
       the gradient obtained from a first difference method using the
       function itself. maccheckgrad must be passed the names of two
       functions, the first which evaluates the function (dymacfn),
       and the second which evaluates its derivatives (dymacint).
       dymacfn is basically the same as dymacint but with the
       derivative stuff removed and the with the correct arguments to
       be compatiable with macopt.  macchecktol is the tolerance to
       which first differences are calculated. This is a bit
       repetitive, but as maccheckgrad is not an integral part of the
       code (it's only for debugging) it's not worth worrying
       about. The final zero just ensures that checking is done for
       all weights.
   */
  
  dymacint_args dga; /* defined in dynet.h */
  macopt_args mca;   /* defined in macopt.h */
  
  double *wtvec;  /* concatenated weight vector for passing to macopt
		     As we do pointer arithmetic on this, type must be
		     consistent with usage elsewhere (i.e. in dymacint) */
  double *fdum;   /* dummy variable for ederiv deallocate call */
  double **f2dum; /* dummy variable for ederiv deallocate call */
  int k,l,m,n;    /* counters */
  int offset;
  int wtvec_size;
  char message[MESSAGESIZE];
  
  if(flag[14]>=3) printf("callmacopt(): Using macopt to train network\n");
  
  /* allocate memory */

  wtvec_size = Vsize*Hsize + (Xsize+1)*Hsize + (Hsize+1)*Ysize;
  wtvec = Dvector(wtvec_size);

  if(flag[4]!=4 && flag[14]>=1) {
    netwarn("callmacopt(): macopt has only been implemented with total batch update.\nflag[4] has been re-set to 4");
    flag[4]=4;
  }
    
  /* Convert weight matrices into single vector for macopt.  Elements
     of wtvec are in order wtVH, wtXH, wtHY.

     This wastes a little time as it is repeated every time macoptII
     calls dymacint. It also increases memory usage as wtvec is copied
     to new wt matrices in dymacint. The alternative, however, is to
     use a vector for the weights throughout, which is prone to
     producing bugs given the intrinsic matrix nature of the weights.
     (If you need convincing, look in the ederiv subroutine.)  Memory
     isn't a problem as the weight matrices are insignificant compared
     to the error derivative arrays. Time also isn't much of an issue
     as it's just copying the matrices once per call of dymacint. */

  for(k=0;k<Vsize;++k) {
    for(m=0;m<Hsize;++m)
      wtvec[Hsize*k + m] = wtVH[k][m];
  }
  offset = Vsize*Hsize;
  for(l=0;l<Xsize+1;++l) {
    for(m=0;m<Hsize;++m)
      wtvec[offset + Hsize*l + m] = wtXH[l][m];
  }
  offset = Vsize*Hsize + (Xsize+1)*Hsize;
  for(m=0;m<Hsize+1;++m) {
    for(n=0;n<Ysize;++n)
      wtvec[offset + Ysize*m + n] = wtHY[m][n];
  }
  
  /* Or can use: for(i=0;i<(Xsize+1)*Hsize;++i)
     new[i]=wtXH[(int)(i/Hsize)][(int)(i%Hsize)]; */
  
  /* assign values of dga
     we could have just defined these parameters as a
     structure in the first place */
  
  dga.Vsize = Vsize;
  dga.Xsize = Xsize;
  dga.Hsize = Hsize;
  dga.Ysize = Ysize;
  dga.tsteps = tsteps;
  dga.ntsteps = ntsteps;
  dga.alpha = alpha;
  dga.beta = beta;
  dga.x = x;
  dga.tar = tar;
  dga.Hlam = Hlam;
  dga.Npats = Npats;
  dga.flag = flag;
  dga.wtvec_size = wtvec_size;
  dga.gtyp = &(mca.gtyp);
  dga.its = &(mca.its);
  dga.previts = 0;
  if(flag[16]==1) { /* we want to write errors to a file during training */
    dga.errf=fopen(errfname,"w");
    if(!dga.errf) {
      sprintf(message, "callmacopt(): error file %s could not be opened", errfname);
      neterr(message);
    }
  }
  
  /* macopt stuff - see top of subroutine for details */
   
  wtvec--; /* macopt uses single offset vectors */
  macopt_defaults(&mca); /* set macopt parameters to defaults... */
  mca.verbose = flag[14]-1; /* should be -1 */
  mca.tol = macconvtol; /* ...except for those from specfile */
  mca.itmax = macitmax;
  mca.rich = 1;
  mca.end_if_small_step = 0;  /* i.e. macopt will only convergence
				 once the gradient is small enough.
				 It will not converge simply if the
				 step is small */
  if(flag[13]==1) { /* check gradient prior to training */
    if(flag[14]>=3) printf("callmacopt(): Calling maccheckgrad\n");
    maccheckgrad(wtvec, wtvec_size, macchecktol,
		 dymacfn, (void *)(&dga), dymacint, (void *)(&dga), 0);
  }
  flag[55]=0;
  macoptII(wtvec, wtvec_size, dymacint, (void *)(&dga), &mca);
  flag[55]=1;
  if(*dga.its-1!=macitmax) /* see discussion under dymacint() */
    dymacint(wtvec, fdum, (void *)(&dga)); /* writes final errors */
  if(flag[13]==1) {
    if(flag[14]>=3) printf("callmacopt(): Calling maccheckgrad\n");
    maccheckgrad(wtvec, wtvec_size, macchecktol,
		 dymacfn, (void *)(&dga), dymacint, (void *)(&dga), 0);
  }
  wtvec++;


  /* rewrite wtvec back into matrices */

  if(flag[14]>=3) printf("Weights passed out of callmacopt():\n");
  for(k=0;k<Vsize;++k) {
    for(m=0;m<Hsize;++m) {
      wtVH[k][m] = wtvec[Hsize*k + m];
      if(flag[14]>=4) printf("wtVH[%d][%d] = %8.5f ", k, m, wtVH[k][m]);
    }
    if(flag[14]>=4) printf("\n");
  }
  offset = Vsize*Hsize;
  for(l=0;l<Xsize+1;++l) {
    for(m=0;m<Hsize;++m) {
      wtXH[l][m] = wtvec[offset + Hsize*l + m];
      if(flag[14]>=4) printf("wtXH[%d][%d] = %8.5f ", l, m, wtXH[l][m]);      
    }
    if(flag[14]>=4) printf("\n");
  }
  offset = Vsize*Hsize + (Xsize+1)*Hsize;
  for(m=0;m<Hsize+1;++m) {
    for(n=0;n<Ysize;++n) {
      wtHY[m][n] = wtvec[offset + Ysize*m + n];
      if(flag[14]>=4) printf("wtHY[%d][%d] = %8.5f ", m, n, wtHY[m][n]);      
    }
    if(flag[14]>=4) printf("\n");
  }
    
  /* deallocate memory */

  free_Dvector(wtvec, wtvec_size);
  /* call ederiv to deallocate static variables - as they are statics I
     would guess that the deallocation has to be done from within ederiv */
  flag[51]=3;
  ederiv(Vsize, Xsize, Hsize, Ysize, fdum, x[1][0], fdum, fdum, fdum,
	 wtVH, wtXH, wtHY, f2dum, f2dum, f2dum,
	 tar[1][0], alpha, beta, Hlam, tsteps[1][0], flag); 
  
  /* close files */
  
  if(flag[16]==1)
    fclose(dga.errf);
  
}



void dymacint(double *wtvec, double *wt_grad, void *pass_args)
{
  /* This is the interface between dynet and macopt: it is the
     gradient evaluating function which macopt calls.  wt_grad is
     declared and defined (i.e. memory allocated) in macopt.c.  The
     wtvec passed in may be the initial vectors (declared and defined
     in callmacopt) or later vectors (declared and defined in macopt).
     
     wtvec (vector) is the weights (passed into this function)
     wt_grad (vector) is the gradient of weights (evaluated by this function) 
     pass_args (structure - see below) carries the arguments for this function

     i.e. only wt_grad is changed by this function.
     
     Note that macopt defines (and/or uses) wtvec and wt_grad as unit
     offset (i.e they start at 1 rather than 0 like in most C code) so
     the pointers which it passes don't actually point to the
     beginning of the vector but one place earlier. To get around this
     we increment the pointers at the beginning of this function and
     decrement them before leaving. Thus the pointer wtvec and wt_grad
     should only be used locally. As we do pointer arithmetic, types
     for these two pointers have to be consistent with those in macopt
     and callmacopt. The alternative is to put "+1" wherever wtvec
     and wt_grad are used.

     pass_args is a pointer to the dga structure defined and declared
     in callmacopt. This pointer (and the data therein) is not
     affected by macopt.

     Note that macopt calls this function more than once per
     iteration.  At the beginning of iteration n, macopt prints the
     gradient from the end of the *previous* iteration. Therefore, in
     this function (below) when an iteration change is detected at the
     beginning of the function, the error evaluated at the end of the
     previous function call (stored as dga->lerr and dga->werr) are
     reported and associated with the current iteration number. This
     way the error reported by this function and the gradient reported
     by macopt correspond to the same iteration and the same point on
     the minimization surface. If macopt does not hit the iteration
     limit, then it will finish without calling this function after
     the converged iteration, so the final errors and gradient are not
     reported by this function. To get around this, callmacopt() calls
     this function again with flag[55]=1 if training finishes without
     hitting the limit. If macopt does hit the limit, then it does
     seems to call this function at the end.

     Update method 4 is currently hard-wired into this algorithm.
     Other methods are not easily compatible with macopt as it is
     essentially a batch optimizer. */

  dymacint_args *dga = (dymacint_args *) pass_args; 
  /* This is necessary to be compatible with macopt. Format taken from
     MacKay's test_function.c */
       
  double **wtVH; /* weights */
  double **wtXH;
  double **wtHY;
  double **edVH; /* error derivatives wrt weights */
  double **edXH;
  double **edHY;
  double **cumedVH; /* cumulative of ed */
  double **cumedXH;
  double **cumedHY;

  double *v_prev; /* recurrent layer */
  double *h_prev; /* hidden layer */
  double *y_prev; /* output layer */
  double *v;

  double toterr;  /* the total error (lerr+werr) */
  double grad;    /* grad is the magnitude of the total gradient.
		     gg from macopt is sum of squares of gradient.
		     gtyp is from macopt, where gtyp = sqrt (gg/wtvec_size)
		     grad = sqrt (gg) = gtyp * sqrt (wtvec_size) */
  double temp;
  
  int p,t;
  int k,l,m,n;
  int offset;

  if(dga->flag[14]>=3)
    printf("dymacint(): Performing forward & backward passes\n");

  /* report errors */

  if(*(dga->its)!=dga->previts) { /* i.e. if we're on the next iteration */
    toterr=dga->lerr+dga->werr;
    grad = *(dga->gtyp)*sqrt((double)dga->wtvec_size);
    if(*dga->its>=1) { /* errors not defined upon first call to this routine */
      if(dga->flag[14]>=3)
	printf("%d: Errors: L = %.5e (%3.1f%%)  W = %.5e (%3.1f%%)  Tot = %.5e\n", *(dga->its), dga->lerr, 100*dga->lerr/toterr, dga->werr, 100*dga->werr/toterr, toterr);
      if(dga->flag[16]==1) /* file opened and closed in callmacopt() */
	fprintf(dga->errf, "%5d  %.5e %5.1f  %.4e %5.1f   %.5e   %.5e\n", *(dga->its), dga->lerr, 100*dga->lerr/toterr, dga->werr, 100*dga->werr/toterr, toterr, grad);
    }
  }
  if(dga->flag[55]==1) /* if we're just calling dymacint() to get final errors */
    return;
  dga->lerr = 0.0;
  dga->werr = 0.0;
  
  /* allocate memory */
  
  wtVH    = Dmatrix(dga->Vsize, dga->Hsize); /* weights */
  wtXH    = Dmatrix(dga->Xsize+1, dga->Hsize);
  wtHY    = Dmatrix(dga->Hsize+1, dga->Ysize);
  edVH    = Dmatrix(dga->Vsize, dga->Hsize); /* dE/dw_ij */
  edXH    = Dmatrix(dga->Xsize+1, dga->Hsize);
  edHY    = Dmatrix(dga->Hsize+1, dga->Ysize);
  cumedVH = Dmatrix(dga->Vsize, dga->Hsize); /* Sum dE/dw_ij */
  cumedXH = Dmatrix(dga->Xsize+1, dga->Hsize);
  cumedHY = Dmatrix(dga->Hsize+1, dga->Ysize);

  v_prev = Dvector(dga->Vsize);
  h_prev = Dvector(dga->Hsize+1);
  y_prev = Dvector(dga->Ysize);
  v      = Dvector(dga->Vsize);
  
  /* increment pointers to accommodate the unit offset in macopt */

  wtvec++;
  wt_grad++;
  
  h_prev[dga->Hsize]=HBIAS; /* initialises hidden layer bias node */

  /* Copy weight vector from macopt into weight matrices */
  
  for(k=0;k<dga->Vsize;++k) {
    for(m=0;m<dga->Hsize;++m)
      wtVH[k][m] = wtvec[dga->Hsize*k + m];
  }
  offset = dga->Vsize*dga->Hsize;
  for(l=0;l<dga->Xsize+1;++l) {
    for(m=0;m<dga->Hsize;++m)
      wtXH[l][m] = wtvec[offset + dga->Hsize*l + m];
  }
  offset = dga->Vsize*dga->Hsize + (dga->Xsize+1)*dga->Hsize;
  for(m=0;m<dga->Hsize+1;++m) {
    for(n=0;n<dga->Ysize;++n)
      wtHY[m][n] = wtvec[offset + dga->Ysize*m + n];
  }

  /* initialise cumed */

  for(m=0;m<dga->Hsize;++m) {
    for(k=0;k<dga->Vsize;++k)
      cumedVH[k][m]=0.0;
    for(l=0;l<dga->Xsize+1;++l)
      cumedXH[l][m]=0.0;
  }
  for(m=0;m<dga->Hsize+1;++m) {
    for(n=0;n<dga->Ysize;++n)
      cumedHY[m][n]=0.0;
  }  
  
  /* Using update method 4, evaluate error derivatives */
  
  for(p=1;p<=dga->Npats;++p) { /* patterns */
    dynetloopinit(dga->Vsize, v_prev, dga->tar[p][0]);
    for(t=1;t<dga->ntsteps[p];++t) { /* epochs */
      dynetloop(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize, v_prev,
		dga->x[p][t-1], h_prev, y_prev, v, dga->tsteps[p][t],
		wtVH, wtXH, wtHY, dga->Hlam);
      if(dga->flag[14]>=4)
	printf("wtVH = %10.7f, wtXH = %10.7f, wtbH = %10.7f, wtHY = %10.7f, wtbY = %10.7f\n", wtVH[0][0], wtXH[0][0], wtXH[1][0], wtHY[0][0], wtHY[1][0]);
      ederiv(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize,
	     v_prev, dga->x[p][t-1], h_prev, y_prev, v,
	     wtVH, wtXH, wtHY, edVH, edXH, edHY, dga->tar[p][t],
	     dga->alpha, dga->beta, dga->Hlam, dga->tsteps[p][t], dga->flag); 
      cumederivs(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize,
		 edVH, edXH, edHY, cumedVH, cumedXH, cumedHY);

      /* diagnostics */
      if(dga->flag[14]>=4) {
	printf("edVH = %10.7f, edXH = %10.7f, edbH = %10.7f, edHY = %10.7f, edbY = %10.7f\n\n", edVH[0][0], edXH[0][0], edXH[1][0], edHY[0][0], edHY[1][0]);
	for(k=0;k<dga->Vsize;++k)
	  printf("p=%2d t=%2d k=%2d  v_prev=% 8.7f y=% 8.7f v=% 8.7f  %2d tar=% 8.7f\n", p, t, k, v_prev[k], y_prev[k], v[k], dga->tar[p][t][k].def, dga->tar[p][t][k].val);
      }
      for(k=0;k<dga->Vsize;++k) { 
	/* only contribute to error if target defined */
	if(dga->tar[p][t][k].def) { 
	  temp = errfn(v[k],dga->tar[p][t][k].val);
	  dga->lerr += 0.5*dga->beta[k]*temp*temp;
	}
      }
      if(dga->flag[8]>0) /* weight decay contribution to error */
	dga->werr += wderr(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize,
			    wtVH, wtXH, wtHY, dga->alpha); 

      updprev(dga->Vsize, v_prev, v); /* copies v into v_prev */      
    } /* end of epoch loop */
    dga->flag[51]=2; /* re-initialise derivative
			dynamic system for next pattern */
  } /* end of pattern loop */

  toterr=dga->lerr+dga->werr;
  if(dga->flag[14]>=4)
    printf("Errors: L = %.5e (%3.1f%%)  W = %.5e (%3.1f%%)  Tot = %.5e\n", dga->lerr, 100*dga->lerr/toterr, dga->werr, 100*dga->werr/toterr, toterr);

  dga->previts = *(dga->its);
  
  /* evaluate wt_grad from cumed terms */

  for(k=0;k<dga->Vsize;++k) {
    for(m=0;m<dga->Hsize;++m)
      wt_grad[dga->Hsize*k + m] = cumedVH[k][m];
  }
  offset = dga->Vsize*dga->Hsize;
  for(l=0;l<dga->Xsize+1;++l) {
    for(m=0;m<dga->Hsize;++m)
      wt_grad[offset + dga->Hsize*l + m] = cumedXH[l][m];
  }
  offset = dga->Vsize*dga->Hsize + (dga->Xsize+1)*dga->Hsize;
  for(m=0;m<dga->Hsize+1;++m) {
    for(n=0;n<dga->Ysize;++n)
      wt_grad[offset + dga->Ysize*m + n] = cumedHY[m][n];
  }

  /* deallocate memory */
  
  free_Dmatrix(wtVH, dga->Vsize, dga->Hsize);
  free_Dmatrix(wtXH, dga->Xsize+1, dga->Hsize);
  free_Dmatrix(wtHY, dga->Hsize+1, dga->Ysize);
  free_Dmatrix(edVH, dga->Vsize, dga->Hsize);
  free_Dmatrix(edXH, dga->Xsize+1, dga->Hsize);
  free_Dmatrix(edHY, dga->Hsize+1, dga->Ysize);
  free_Dmatrix(cumedVH, dga->Vsize, dga->Hsize);
  free_Dmatrix(cumedXH, dga->Xsize+1, dga->Hsize);
  free_Dmatrix(cumedHY, dga->Hsize+1, dga->Ysize);

  free_Dvector(v_prev, dga->Vsize);
  free_Dvector(h_prev, dga->Hsize+1);
  free_Dvector(y_prev, dga->Ysize);
  free_Dvector(v, dga->Vsize);

  /* decrement pointers to accommodate the unit offset in macopt */

  wtvec--;
  wt_grad--;
  
}


double dymacfn(double *wtvec, void *pass_args)
{
  /* Evaluates the error function for a given weight vector (wtvec)
     and returns it as the function value. Neither wtvec nor
     pass_args are changed by this function.
     
     This is only used by maccheckgrad.  It is very similar to
     dymacint, but with derivative code removed.  Error is evaluated
     from scratch, i.e. using all the input x values etc. in
     pass_args. This is inefficient, but it doesn't matter as it's
     only a diagnostic.  Ultimately you could probably integrate this
     with dynetapply.

     Update method 4 is currently hard-wired into this algorithm, i.e.
     the total error evaluated is the sum for all patterns at all epochs.
     */

  dymacint_args *dga = (dymacint_args *) pass_args; 
  /* This is necessary to be compatible with macopt. Format taken from
     MacKay's test_function.c */

  double lerr = 0.0; /* the likelihhod contribution to the error */
  double werr = 0.0; /* the weight decay contribution to the error */
  double toterr;     /* the total error (lerr+werr) */
  double temp;
  
  double **wtVH; /* weights */
  double **wtXH;
  double **wtHY;

  double *v_prev; /* recurrent layer */
  double *h_prev; /* hidden layer */
  double *y_prev; /* output layer */
  double *v;
  
  int p,t;
  int k,l,m,n;
  int offset;


  if(dga->flag[14]>=3)
    printf("dymacfn(): Performing forward pass\n");

  /* allocate memory */
  
  wtVH   = Dmatrix(dga->Vsize, dga->Hsize); /* weights */
  wtXH   = Dmatrix(dga->Xsize+1, dga->Hsize);
  wtHY   = Dmatrix(dga->Hsize+1, dga->Ysize);

  v_prev = Dvector(dga->Vsize);
  h_prev = Dvector(dga->Hsize+1);
  y_prev = Dvector(dga->Ysize);
  v      = Dvector(dga->Vsize);

  /* increment pointer to accommodate the unit offset in macopt */

  wtvec++;
  
  h_prev[dga->Hsize]=HBIAS; /* initialises hidden layer bias node */

  /* Copy weight vector from macopt into weight matrices */
  
  for(k=0;k<dga->Vsize;++k) {
    for(m=0;m<dga->Hsize;++m)
      wtVH[k][m] = wtvec[dga->Hsize*k + m];
  }
  offset = dga->Vsize*dga->Hsize;
  for(l=0;l<dga->Xsize+1;++l) {
    for(m=0;m<dga->Hsize;++m)
      wtXH[l][m] = wtvec[offset + dga->Hsize*l + m];
  }
  offset = dga->Vsize*dga->Hsize + (dga->Xsize+1)*dga->Hsize;
  for(m=0;m<dga->Hsize+1;++m) {
    for(n=0;n<dga->Ysize;++n)
      wtHY[m][n] = wtvec[offset + dga->Ysize*m + n];
  }
    
  /* Evaluate total error for all epochs and patterns
     (i.e. "update" method 4) */
  
  for(p=1;p<=dga->Npats;++p) { /* patterns */
    /* set v_prev for first epoch */
    dynetloopinit(dga->Vsize, v_prev, dga->tar[p][0]);
    for(t=1;t<dga->ntsteps[p];++t) { /* epochs - NB we start from t=1 */
      dynetloop(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize, v_prev,
		dga->x[p][t-1], h_prev, y_prev, v, dga->tsteps[p][t],
		wtVH, wtXH, wtHY, dga->Hlam);
      updprev(dga->Vsize, v_prev, v); /* copies v into v_prev */
      for(k=0;k<dga->Vsize;++k) { 
	/* only contribute to error if target defined */
	if(dga->tar[p][t][k].def) { 
	  temp = (double)errfn(v[k],dga->tar[p][t][k].val);
	  lerr += 0.5*dga->beta[k]*temp*temp;
	}
      }
      if(dga->flag[8]>0) /* weight decay contribution to error */
	werr += wderr(dga->Vsize, dga->Xsize, dga->Hsize, dga->Ysize,
			    wtVH, wtXH, wtHY, dga->alpha); 
    } /* end of epoch loop */
  } /* end of pattern loop */

  toterr=lerr+werr;
  if(dga->flag[14]>=3)
    printf("dymacint(): Errors: L = %5.2e (%3.1f%%)  W = %5.2e (%3.1f%%)  Tot = %5.2e\n", lerr, 100*lerr/toterr, werr, 100*werr/toterr, toterr);

  /* deallocate memory */

  free_Dmatrix(wtVH, dga->Vsize, dga->Hsize);
  free_Dmatrix(wtXH, dga->Xsize+1, dga->Hsize);
  free_Dmatrix(wtHY, dga->Hsize+1, dga->Ysize);
  
  free_Dvector(v_prev, dga->Vsize);
  free_Dvector(h_prev, dga->Hsize+1);
  free_Dvector(y_prev, dga->Ysize);
  free_Dvector(v, dga->Vsize);

  /* decrement pointer to accommodate the unit offset in macopt */

  wtvec--;

  return(toterr);
  
}



/* ==================== Gradient Evaluation Routines =================== */

void ederiv(int Vsize, int Xsize, int Hsize, int Ysize, double *v_prev,
	    double *x_prev, double *h_prev, double *y_prev, double *v,
	    double **wtVH, double **wtXH, double **wtHY,
	    double **edVH, double **edXH, double **edHY,
	    targets *tar, double *alpha, double *beta, double Hlam,
	    double tstep, int *flag)
{
  /* Evaluate derivatives of error with respect to all weights (ij) at
     a single epoch for a single pattern.

     tstep is Delta(t)

     A forward pass of the network is done using dynetloop() before
     this routine is called. dynetloop evaluates v(t), y(t-1) and
     h(t-1) from v(t-1) and x(t-1). All of these are passed into this
     routine. The (t-1) variables are denoted by _prev.
     
     I use static storage classes for the previous values of the
     weight derivatives and state variables. This requries some
     careful allocation and deallocation of memory and copying of
     variables at the end of the loop.

     Note that the current values of the derivaties etc. (e.g. dvVH)
     have memory allocated and deallocated each time this subroutine is
     called. Whilst that is quite tidy, how much time does it waste?
     (Probably not much relatively speaking as there are so many loops
     in this subroutine.)

     Note also that tar in this subroutine is a 1D vector of type
     targets rather than a 3D array of type targets as in the rest of
     the program.
     */
  
  /* declare dv_ij(t-1)/dt, dy_ij(t-1)/dt, dv_prev_ij(t-1)/dt,
     dy_prev_ij(t-1)/dt for VH, XH and HY weights */

  double ***dvVH;
  static double ***dvVH_prev;
  double ***dyVH_prev;
  
  double ***dvXH;
  static double ***dvXH_prev;
  double ***dyXH_prev;

  double ***dvHY;
  static double ***dvHY_prev;
  double ***dyHY_prev;

  double s1,s2;   /* intermediate sum variables */
  double r1;      /* intermediate variables */
  double *ipH;    /* intermediate products for H */
  double ***tmp3; /* temporary pointer */
  
  int i,j,k,m,n;


  /* allocate memory */

  ipH  = Dvector(Hsize);
  dvVH = Dmatrix3(Vsize, Vsize, Hsize);
  dyVH_prev = Dmatrix3(Ysize, Vsize, Hsize);
  dvXH = Dmatrix3(Vsize, Xsize+1, Hsize);
  dyXH_prev = Dmatrix3(Ysize, Xsize+1, Hsize);
  dvHY = Dmatrix3(Vsize, Hsize+1, Ysize);
  dyHY_prev = Dmatrix3(Ysize, Hsize+1, Ysize);
  
  if(flag[51]==1) { /* allocate */    
    if(flag[14]>=3)
      printf("ederiv(): Allocating memory for dvWW_prev (weight derivatives)\n");
    dvVH_prev = Dmatrix3(Vsize, Vsize, Hsize);
    dvXH_prev = Dmatrix3(Vsize, Xsize+1, Hsize);
    dvHY_prev = Dmatrix3(Vsize, Hsize+1, Ysize);
    flag[51]=2;
  }

  if(flag[51]==2) { /* initialise */
    dysyswtinit(Vsize, Xsize, Hsize, Ysize,
	      dvVH_prev, dvXH_prev, dvHY_prev);
    flag[51]=0;
  }

  if(flag[51]==3) { /* deallocate */
    if(flag[14]>=3)
      printf("ederiv(): Deallocating memory for dvWW_prev (weight derivatives)\n");
    free_Dmatrix3(dvVH_prev, Vsize, Vsize, Hsize);
    free_Dmatrix3(dvXH_prev, Vsize, Xsize+1, Hsize);
    free_Dmatrix3(dvHY_prev, Vsize, Hsize+1, Ysize);
    return;
  } 

  /* initialise error derivatives */

  for(j=0;j<Hsize;++j) {
    for(i=0;i<Vsize;++i)
      edVH[i][j]=0.0;
    for(i=0;i<Xsize+1;++i)
      edXH[i][j]=0.0;
  }
  for(i=0;i<Hsize+1;++i) {
    for(j=0;j<Ysize;++j)
      edHY[i][j]=0.0;
  }
    
  /* equations C and B
     loop levels are: 1-Y (n)
                         2a-H (i) (not including H bias)
			     3-V (j) (VH weights - eqn. C2)
			        4-H (m)
			           5-V (k)
			     3-X (j) (XH weights - eqn. C3)
		                4-H (m)
			           5-V (k)
                         2a-H (i) (including H bias)
			     3-Y (j) (HY weights - eqn. C1)
			        4-H (m)
			           5-V (k)

  i.e. loops are nested 5 deep
  */

  /* equation C - evaluate dy terms */
  for(n=0;n<Ysize;++n) { /* level 1 */
    r1 = dTg(y_prev[n]);

    for(j=0;j<Hsize;++j) { /* level 2a - not including H bias */
      for(i=0;i<Vsize;++i) { /* level 3 - VH */
	for(s2=0,m=0;m<Hsize;++m) { /* level 4 */
	  ipH[m] = wtHY[m][n]*dTf(h_prev[m], Hlam);
	  for(s1=0,k=0;k<Vsize;++k) /* level 5 */
	    s1 += wtVH[k][m]*dvVH_prev[k][i][j];
	  if(idelta(m,j))
	    s1+=v_prev[i];
	  s2 += s1*ipH[m];
	}
	dyVH_prev[n][i][j] = r1*s2; 
      }      
      for(i=0;i<Xsize+1;++i) { /* level 3 - XH */
	for(s2=0,m=0;m<Hsize;++m) { /* level 4 */
	  for(s1=0,k=0;k<Vsize;++k) /* level 5 */
	    s1 += wtVH[k][m]*dvXH_prev[k][i][j];
	  if(idelta(m,j))
	    s1+=x_prev[i];
	  s2 += s1*ipH[m];
	}
	dyXH_prev[n][i][j] = r1*s2;
      }
    } /* end of level 2a */

    for(i=0;i<Hsize+1;++i) { /* level 2b - including H bias */
      for(j=0;j<Ysize;++j) { /* level 3 - HY */
	for(s2=0,m=0;m<Hsize;++m) { /* level 4 */
	  for(s1=0,k=0;k<Vsize;++k) /* level 5 */
	    s1 += wtVH[k][m]*dvHY_prev[k][i][j];
	  s2 += s1*ipH[m];
	}
	if(fdelta(n,j))
	  s2+=h_prev[i];
	dyHY_prev[n][i][j] = r1*s2;
      }      
    } /* end of level 2b */

  } /* end of level 1 */

  /* equation B - evaluate dv and ed terms */
  for(k=0;k<Vsize;++k) { /* level 1 */

    for(j=0;j<Hsize;++j) { /* level 2a - not including H bias */
      for(i=0;i<Vsize;++i) {/* level 3 - VH */
	dvVH[k][i][j] = dvVH_prev[k][i][j] + tstep*dyVH_prev[k][i][j]; 
	if(tar[k].def) /* only contribute to error if target defined */
	  edVH[i][j]+=beta[k]*errfn(v[k],tar[k].val)*dvVH[k][i][j]; 
      }
      for(i=0;i<Xsize+1;++i) {/* level 3 - XH */
	dvXH[k][i][j] = dvXH_prev[k][i][j] + tstep*dyXH_prev[k][i][j];
	if(tar[k].def) /* only contribute to error if target defined */
	  edXH[i][j]+=beta[k]*errfn(v[k],tar[k].val)*dvXH[k][i][j]; 
      }
    } /* end of level 2a */

    for(i=0;i<Hsize+1;++i) { /* level 2b - including H bias */
      for(j=0;j<Ysize;++j) {/* level 3 - HY */
	dvHY[k][i][j] = dvHY_prev[k][i][j] + tstep*dyHY_prev[k][i][j];
	if(tar[k].def) /* only contribute to error if target defined */
	  edHY[i][j]+=beta[k]*errfn(v[k],tar[k].val)*dvHY[k][i][j]; 
      }
    } /* end of level 2b */

  } /* end of level 1 */

  /* Contributions from weight decay - note that if we're using weight
     decay then we get contributions (for non-zero alphas) even when
     outputs are not defined. Could this be a problem? Will we get too
     much regularization from weight decay error contributions? */
  
  if(flag[8]>0) { /* using weight decay */
    for(j=0;j<Hsize;++j) { /* not including H bias */
      for(i=0;i<Vsize;++i) /* VH */
	edVH[i][j]+=alpha[0]*wtVH[i][j]; 
      for(i=0;i<Xsize;++i) /* XH */
	edXH[i][j]+=alpha[1]*wtXH[i][j];
      edXH[Xsize][j]+=alpha[2]*wtXH[Xsize][j]; /* input bias */
    }
    for(i=0;i<Hsize+1;++i) { /* including H bias */
      for(j=0;j<Ysize;++j) /* HY */
	edHY[i][j]+=alpha[3]*wtHY[i][j]; 
    }
  }

  if(flag[14]>=4)
    printf("dvVH_prev = %8.5f, dvXH_prev = %8.5f, dvbH_prev = %8.5f, dvHY_prev = %8.5f, dvbY_prev = %8.5f\n", dvVH_prev[0][0][0], dvXH_prev[0][0][0], dvXH_prev[0][1][0], dvHY_prev[0][0][0], dvHY_prev[0][1][0]);
  
  /* data shifts at end of subroutine by redirecting pointers */

  tmp3 = dvVH_prev;
  dvVH_prev = dvVH;
  dvVH = tmp3;

  tmp3 = dvXH_prev;
  dvXH_prev = dvXH;
  dvXH = tmp3;

  tmp3 = dvHY_prev;
  dvHY_prev = dvHY;
  dvHY = tmp3;
  
  /* deallocate memory */
  
  free_Dvector(ipH, Hsize);
  free_Dmatrix3(dvVH, Vsize, Vsize, Hsize);
  free_Dmatrix3(dyVH_prev, Ysize, Vsize, Hsize);
  free_Dmatrix3(dvXH, Vsize, Xsize+1, Hsize);
  free_Dmatrix3(dyXH_prev, Ysize, Xsize+1, Hsize);
  free_Dmatrix3(dvHY, Vsize, Hsize+1, Ysize);
  free_Dmatrix3(dyHY_prev, Ysize, Hsize+1, Ysize);
  
}



void cumederivs(int Vsize, int Xsize, int Hsize, int Ysize,
		double **edVH, double **edXH, double **edHY, 
		double **cumedVH, double **cumedXH, double **cumedHY)
{
  /* cumulate the error derivatives (ed) into cumed */

  int k,l,m,n;

  for(m=0;m<Hsize;++m) {
    for(k=0;k<Vsize;++k)
      cumedVH[k][m] += edVH[k][m];
    for(l=0;l<Xsize+1;++l)
      cumedXH[l][m] += edXH[l][m];
  }
  for(m=0;m<Hsize+1;++m) {
    for(n=0;n<Ysize;++n) 
      cumedHY[m][n] += edHY[m][n];
  }  

}



/* ==================== Initialisation Routines =================== */

void dynetinit(int Vsize, int Vm, int Xsize, int Hsize, int Ysize, int Npats,
	       double **wtVH, double **wtXH, double **wtHY, double **vscale,
	       double **xscale, double *Hlam, int *flag, char *inwtfname,
	       char *sfname, long ranseed, double **tsteps,
	       double *alpha, double *beta, double wtrng)
{
  /* Initialise recurrent neural network
     
     This routine is called if the network has not just be trained,
     i.e. it is called by dynettrain and may be called by dynetapply.

     Hlam is always set by this routine.
     Weights are either read in from weights file or randomized.
     Scaling factors are only set if read in from the weights file.
     
     */

  FILE *inwtf; /* input weight file */
  FILE *sf;    /* specfile */
  char message[MESSAGESIZE]; /* message */
  char word[WORDSIZE], tmp[WORDSIZE]; /* generic words */
  char scaleopt[WORDSIZE]; /* scaling option */
  int k,l,m,n;
  int Vtmp, Vmtmp, Xtmp, Htmp;
  double betasum; /* used as a test of zero beta values */

  /* set Hlam to default */

  *Hlam = 1.0;
  
  if(flag[10]==1) { /* read weights and scaling factors from file */
    inwtf=fopen(inwtfname,"r");
    if(!inwtf) {
      sprintf(message, "dynetinit(): weights file %s could not be opened",
	      inwtfname);
      neterr(message);
    }
    if(flag[14]>=2)
      printf("dynetinit(): Reading weights from %s\n", inwtfname);

    fgets(word, WORDSIZE, inwtf); /* read line including \n */
    fgets(word, WORDSIZE, inwtf);
    fgets(word, WORDSIZE, inwtf);
    fgets(word, WORDSIZE, inwtf);
    sscanf(word, "%d %d %d %d\n", &Vtmp, &Vmtmp, &Xtmp, &Htmp);
    if(Vtmp!=Vsize || Vmtmp!=Vm || Xtmp!=Xsize || Htmp!= Hsize)
      neterr("dynetinit(): Network size in weights file does not match spec file"); 

    fgets(word, WORDSIZE, inwtf);
    fgets(word, WORDSIZE, inwtf);
    sscanf(word,"%s",scaleopt);
    if(strcmp(scaleopt,"none")==0) {
      if(flag[12]!=0 && flag[14]>=1) {
	netwarn("dynetinit(): Input weights file indicates no scaling option in\n conflict with spec file. Former takes precedence and hence no\n scaling of input data will be performed");
	flag[12]=0;
      }
    }
    else if(strcmp(scaleopt,"var")==0) {
      if(flag[12]!=1 && flag[14]>=1) {
	netwarn("dynetinit(): Input weights file indicates variance scaling option\n in conflict with spec file. Former takes precedence and will be used\n to scale state variables and external inputs");
	flag[12]=1;
      }
      fgets(word, WORDSIZE, inwtf);
      for(k=0;k<Vsize;++k) {
	fgets(word, WORDSIZE, inwtf);
	sscanf(word, "%lf %lf", &vscale[0][k], &vscale[1][k]);
      }
      fgets(word, WORDSIZE, inwtf);
      for(l=0;l<Xsize;++l) { /* this does not count biases */
	fgets(word, WORDSIZE, inwtf);
	sscanf(word, "%lf %lf", &xscale[0][l], &xscale[1][l]);
      }
    }
    else if(strcmp(scaleopt,"maxmin")==0)
      neterr("dynetinit(): maxmin scaling option not yet implemented");
    else if(strcmp(scaleopt,"netsize")==0)
      ; /* don't do anything: Hlam is read in for all options anyway */
    else 
      neterr("dynetinit(): invalid scaling option in weights file");
    
    fgets(word, WORDSIZE, inwtf);
    fgets(word, WORDSIZE, inwtf);
    sscanf(word, "%lf", Hlam);
    fgets(word, WORDSIZE, inwtf);
    for(k=0;k<Vsize;++k) {
      for(m=0;m<Hsize;++m)
	fscanf(inwtf, "%lf", &wtVH[k][m]);
      fgets(word, WORDSIZE, inwtf); /* to remove \n */
    }
    fgets(word, WORDSIZE, inwtf);
    for(l=0;l<Xsize+1;++l) {
      for(m=0;m<Hsize;++m)
	fscanf(inwtf, "%lf", &wtXH[l][m]);
      fgets(word, WORDSIZE, inwtf); /* to remove \n */
    }
    fgets(word, WORDSIZE, inwtf);
    for(m=0;m<Hsize+1;++m) {
      for(n=0;n<Ysize;++n)
	fscanf(inwtf, "%lf", &wtHY[m][n]);
      fgets(word, WORDSIZE, inwtf);
    }
    fclose(inwtf);
  }

  else if(flag[10]==0) { /* initialise weights
			    NB scaling factors not set here */
    if(flag[14]>=2)
      printf("dynetinit(): Initialising weights\n");
    if(flag[9]==1) { /* uniform weight distribution */
      for(k=0;k<Vsize;++k) {
	for(m=0;m<Hsize;++m) 
	  wtVH[k][m] = 2*wtrng*(double)(ran1(&ranseed)-0.5);
      }
      for(l=0;l<Xsize+1;++l) {
	for(m=0;m<Hsize;++m)
	  wtXH[l][m] = 2*wtrng*(double)(ran1(&ranseed)-0.5);
      }
      for(m=0;m<Hsize+1;++m) {
	for(n=0;n<Ysize;++n)
	  wtHY[m][n] = 2*wtrng*(double)(ran1(&ranseed)-0.5);
      }
    }
    else
      neterr("dynetinit(): Only a uniform weight distribution\nfor initialisation is currently available\nBummer huh?");
  }
  
  /* set alpha and beta parameters only if training
   - searches specfile twice. Slow maybe, but makes neater code. */

  if(flag[52]==1)
    return;

  /* set alpha parameters */

  if(flag[8]==2) {
    sf=fopen(sfname,"r");
    if(!sf) {
      sprintf(message, "dynetinit(): specfile %s could not be opened",
	      sfname);
      neterr(message);
    }
    while(fscanf(sf,"%s",word)!=EOF) {
      if(word[0]=='#') { /* NB Must use single quotes! */
	fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	continue;
      }
      if(strcmp(word,"TRN:alpha_VH")==0) {
	fscanf(sf,"%lf",&alpha[0]);
	if(alpha[0]<0.0)
	  neterr("dynetinit(): alpha_VH is an invalid negative value");
      }
      if(strcmp(word,"TRN:alpha_XH")==0) {
	fscanf(sf,"%lf",&alpha[1]);
	if(alpha[1]<0.0)
	  neterr("dynetinit(): alpha_XH is an invalid negative value");
      }
      if(strcmp(word,"TRN:alpha_bH")==0) {
	fscanf(sf,"%lf",&alpha[2]);
	if(alpha[2]<0.0) 
	  neterr("dynetinit(): alpha_bH is an invalid negative value");
      }
      if(strcmp(word,"TRN:alpha_HY")==0) {
	fscanf(sf,"%lf",&alpha[3]);
	if(alpha[3]<0.0)
	  neterr("dynetinit(): alpha_HY is an invalid negative value");
      }
    } /* end of file searching while loop */
    fclose(sf);
  }
  else if(flag[8]==1) {
    alpha[0]=alpha_VH_DEF;
    alpha[1]=alpha_XH_DEF;
    alpha[2]=alpha_bH_DEF;
    alpha[3]=alpha_HY_DEF;
  }
  else { /* strictly unnecessary: just for security */
    for(n=0;n<Nalpha;++n)
      alpha[n]=0.0;
  }
  
  /* set beta parameters */
  
  /* Note that if we do not set a sufficient number of values, then the
     last value is used for all the values of beta where a value is not
     specified. This is useful for ensuring that any "unmeasured" state
     variables are assigned a suitable beta value. However, it's
     best to assign your own values. */
     
  if(flag[7]==1) {
    sf=fopen(sfname,"r");
    if(!sf) {
      sprintf(message, "dynettrain(): specfile %s could not be opened",
	      sfname);
      neterr(message);
    }
    while(fscanf(sf,"%s",word)!=EOF) {
      if(word[0]=='#') { /* NB Must use single quotes! */
	fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	continue;
      }
      if(strcmp(word,"TRN:use_beta_parameters?_(yes/no)")==0) {
	fgets(word, WORDSIZE, sf); /* to remove "yes" or "no" */
	for(k=0;k<Vsize;++k) {
	  fgets(word, WORDSIZE, sf); 
	  sscanf(word,"%s %lf", tmp, &beta[k]);  
	  /* ignore value if incorrect stem */ 
	  if(strcmp(tmp,BETASTEM)!=0)
	    beta[k]=BETADEF;
	}
	break; /* stop scaning specfile */
      }
    }
    fclose(sf);
    for(k=1;k<Vsize;++k) { /* if no beta specified for any k, assign
			      last non-zero value - doesn't work unless
			      at least beta[0] is assigned */
      if(beta[k]==0.0)
	beta[k]=beta[k-1];
    }
  }
  else { /* use defaults */
    for(k=0;k<Vsize;++k)
      beta[k]=BETADEF;
  }

  /* print out and check values of beta */
  if(flag[14]>=2) {
    betasum=0.0;
    for(k=0;k<Vsize;++k) {
      printf("dynetinit(): beta[%d] = %9.5f\n", k, beta[k]);
      if(beta[k]<0.0)
	neterr("dynetinit(): Invalid negative beta value");
      else if(beta[k]==0.0)
	netwarn("dynetinit(): zero beta value. This state variable will not contribute to\n the error function, so it will not be constrained by the training");
      betasum+=beta[k];
    }
    if(flag[7]==1 && betasum==0.0)
      netwarn("dynetinit(): All beta values are zero. Training will be data independent");
  }
  
}



void dynetloopinit(int Vsize, double *v_prev, targets *tar)
{

  int k;

  for(k=0;k<Vsize;++k) {
    if(tar[k].def)
      v_prev[k]=tar[k].val;
    else
      v_prev[k]=VINITDEF;
  }

}



void dysyswtinit(int Vsize, int Xsize, int Hsize, int Ysize,
	       double ***dvVH_prev, double ***dvXH_prev, double ***dvHY_prev)
{
  /* initialise weight derivatives of derivative dynamic system
     currently everything is set to zero. If this is changed, the relevance
     of the data scaling must be considered. */

  int i,k,m;

  for(k=0;k<Vsize;++k) {
    for(m=0;m<Hsize;++m) {
      for(i=0;i<Vsize;++i)
	dvVH_prev[k][i][m] = dVINITDEF;
      for(i=0;i<Xsize+1;++i)
	dvXH_prev[k][i][m] = dVINITDEF;
    }
    for(m=0;m<Hsize+1;++m) {
      for(i=0;i<Ysize;++i)
	dvHY_prev[k][m][i] = dVINITDEF;      
    }
  }
  
}



/* ==================== Scaling Routines =================== */

void scalecalc(int Vsize, int Xsize, int Npats, int *ntsteps,
	       double ***x, targets ***tar,
	       double **xscale, double **vscale, double *Hlam, int *flag)
{
  /* Calculate the data scale factors for external inputs
     and recurrent inputs and Hlam.

     Note that as the final x input vector is not used in the network
     it is not included in the scaling.

     If there are no defined values for a given node, scaling should
     not affect what occurs in rest of program. Thus:
       mean-sd scaling -> mean=0, sd=1
     If there is only one defined value on a given node or all 
     defined values are equal:
       mean-sd scaling -> mean=<value>, sd=1
     These prevent divide by zero in datascale()

     */

  
  int p,t,k,l;
  int numvecs;
  int *vnum;
  long double tmp; 
  long double vtmp;
  long double *xtmp;
  char message[MESSAGESIZE];
  
  /* evaluate Hlam */
  
  *Hlam = 1/( sqrt((double)(Xsize+1+Vsize)) );

  if(flag[12]==3) /* i.e. only set Hlam */
    return;

  if(flag[12]!=1)
    neterr("scalecalc(): Scaling option not yet implemented");
  /* put all of the following into an if clause if and when you write
     the minmax scaling option. Also, if you implement on-line learning
     you could evaluate some scale factors here */

  if(flag[14]>=2)
    printf("scalecalc(): Calculating data scale\n");
  
  /* initialise xscale - the use of numvecs must be changed so that
   you use an xnum variable in the same way that you use the vnum
   variable in the vscale thing below */

  xtmp = lDvector(Xsize);
  for(l=0;l<Xsize;++l) {
    xscale[0][l]=0.0;
    xscale[1][l]=0.0;
    xtmp[l]=0.0;
  }

  /* first pass: evaluate mean */
  for(p=1,numvecs=0;p<=Npats;++p) { /* patterns */
    for(t=0;t<ntsteps[p]-1;++t) { /* epochs */
      numvecs++;
      for(l=0;l<Xsize;++l)  
	xtmp[l] += (long double)x[p][t][l];
    }
  }
  for(l=0;l<Xsize;++l) {
    xscale[0][l] = (xtmp[l]/(long double)numvecs);
    xtmp[l] = 0.0; /* re-initialise */
  }

  /* second pass: evaluate standard deviation */
  for(p=1;p<=Npats;++p) { /* patterns */
    for(t=0;t<ntsteps[p]-1;++t) { /* epochs */
      for(l=0;l<Xsize;++l) { 
	tmp = (long double)x[p][t][l] - (long double)xscale[0][l];
	xtmp[l] += tmp*tmp; 
      }
    }
  }

  for(l=0;l<Xsize;++l) {
    xscale[1][l] = (double)(sqrt( (double)(xtmp[l]/(long double)numvecs) ));
    if(flag[14]>=2)
      printf(" input (x) %d: mean = % 8.5f  sd = % 8.5f\n", l, xscale[0][l], xscale[1][l]);
  }
  for(l=0;l<Xsize;++l) {
    if(xscale[1][l]==0.0) { /* i.e. all inputs for a given node are equal */
      xscale[1][l]=1.0;
      if(flag[14]>=1) {
	sprintf(message, "scalecalc(): X standard deviation for l=%d is zero (all inputs the same?)\n Forcing stddev=1 to prevent divide by zero", l);
	netwarn(message);
      }
    }
  }
  
  free_lDvector(xtmp, Xsize);

  /* initialise vscale - here the number of nodes is the outermost loop,
     in contrast to the xscale calculation, where the number of patterns
     was the outermost loop. There was probably a good reason for this! */
  
  vnum=Ivector(Vsize);
  for(k=0;k<Vsize;++k) {
    vscale[0][k]=0.0;
    vscale[1][k]=0.0;
    vnum[k]=0;
  }

  /* first pass: evaluate mean */
  for(k=0;k<Vsize;++k) { /* nodes */
    vtmp=0.0;
    for(p=1,numvecs=0;p<=Npats;++p) { /* patterns */
      for(t=0;t<ntsteps[p];++t) { /* epochs */
	if(tar[p][t][k].def) {
	  vtmp += (long double)tar[p][t][k].val;
	  vnum[k]++;
	}
      }
    } /* end of patterns */
    if(vnum[k]==0)
      vscale[0][k]=0.0;
    else
      vscale[0][k] = (double)(vtmp/(long double)vnum[k]);
  }
  
  /* second pass: evaluate standard deviation */
  for(k=0;k<Vsize;++k) { /* nodes */
    if(vnum[k]<=1)
      vscale[1][k]=1.0; /* Force sd to unity
			   to prevent divide by zero in datascale() */
    else {
      vtmp=0.0;
      for(p=1;p<=Npats;++p) { /* patterns */
	for(t=0;t<ntsteps[p];++t) { /* epochs */
	  if(tar[p][t][k].def) {
	    tmp   = (long double)tar[p][t][k].val - (long double)vscale[0][k];
	    vtmp += tmp*tmp; 
	  }
	} /* end of epochs */
      } /* end of patterns */
      vscale[1][k] = (double)(sqrt( (double)(vtmp/(long double)vnum[k]) ));
      if(flag[14]>=2)
	printf(" state (v) %d: mean = % 8.5f  sd = % 8.5f\n", k, vscale[0][k], vscale[1][k]);
    }
  }

  for(k=0;k<Vsize;++k) {
    if(vscale[1][k]==0.0) {
      vscale[1][k]=1.0;
      if(flag[14]>=1) {
	sprintf(message, "datascale(): V standard deviation for k=%d is zero (all inputs the same?)\n Forcing stddev=1 to prevent divide by zero", k);
	netwarn(message);
      }
    }
  }
  
  free_Ivector(vnum, Vsize);
    
}



void datascale(int Vsize, int Xsize, int Npats, int *ntsteps, double ***x,
	       targets ***tar, double **xscale, double **vscale, int *flag)
{
  /* scale the external inputs and recurrent inputs
     using known scaling parameters */

  int p,t,k,l;
  char message[MESSAGESIZE];

  
  if(flag[14]>=2) printf("datascale(): Scaling data\n");
  if(flag[12]==1) { /* mean and standard deviation scale */
    for(p=1;p<=Npats;++p) { /* patterns */
      if(flag[14]>=3) printf("p = %2d\n", p);
      /* scale v */
      if(flag[14]>=3) printf("targets (scaled values):\n");
      for(t=0;t<ntsteps[p];++t) { /* epochs */
	for(k=0;k<Vsize;++k) { /* nodes */
	  if(tar[p][t][k].def) {
	    if(vscale[1][k]==0.0) {
	      sprintf(message, "datascale(): attempt to divide by zero (V standard deviation for k=%d is zero)", k);
	      neterr(message);
	    }
	    tar[p][t][k].val = (tar[p][t][k].val-vscale[0][k])/vscale[1][k];
	    if(flag[14]>=3) printf("% 9.5f ", tar[p][t][k].val);
	  }
	  else {
	    if(flag[14]>=3) printf("  x       ");
	  }
	} /* end of nodes */
	  if(flag[14]>=3) printf("\n"); 
      } /* end of epochs */
      /* scale x */
      if(flag[14]>=3) printf("external inputs (scaled values):\n");
      for(t=0;t<ntsteps[p]-1;++t) { /* epochs */
	for(l=0;l<Xsize;++l) {
	  if(xscale[1][l]==0.0) {
	    sprintf(message, "datascale(): attempt to divide by zero (X standard deviation for l=%d is zero)", l);
	    neterr(message);
	  }
	  x[p][t][l] = (x[p][t][l]-xscale[0][l])/xscale[1][l];
	  if(flag[14]>=3) printf("% 9.5f ", x[p][t][l]);
	}
	if(flag[14]>=3) printf("\n"); 
      } /* end of epochs */
    } /* end of patterns */
  }
  else
    neterr("datascale(): Scaling option not yet implemented");
  
}



void unscale(int Vsize, double *v, double **vscale, int *flag)
{
  /* unscale a vector */

  int k;
  
  if(flag[12]==1) { /* mean and standard deviation scale */
    for(k=0;k<Vsize;++k)
      v[k] = v[k]*vscale[1][k] + vscale[0][k];
  }
  else
    neterr("unscale(): Scaling option not yet implemented");
}



/* ==================== Input/Output Routines =================== */

void specread(char *sfname, int *flag,
	      int *Vsize, int *Vm, int *Xsize, int *Hsize, int *Ysize,
	      int *Nits, double *eta,
	      int *TRN_Npats, char *TRN_tpfname[],
	      int *APP_Npats, char *APP_tpfname[],
	      char *inwtfname, char *otwtfname, long *ranseed,
	      double *macconvtol, int *macitmax, double *macchecktol,
	      double *wtrng, char *plotfname, char *errfname)
{
  /* Read in spec file 

     The general rule is that if a bogus input is specified, flags
     etc. will be set to their default values without warning.
     Exceptions are where inputs are NOT simply "yes/no".

     Could have passed tpfname in as char **tpfname, it's equivalent
     Need to pass back read-in variables, so defined them in as pointers */
  
  FILE *sf;
  char message[MESSAGESIZE];
  char word[WORDSIZE];
  double fnum;
  int inum,p;


  /* set specifications (flags etc.) to their default values. Some 
     values are bizarre to serve as checks on whether they've been set
  */

  flag[1] = 0;
  flag[2] = 0;  
  flag[3] = 2;
  flag[4] = 4;
  flag[7] = 0;
  flag[8] = 0;
  flag[9] = 1;
  flag[10]= 0;
  flag[11]= 0;
  flag[12]= 1;
  flag[13]= 0;
  flag[14]= 2;
  flag[15]= 0;
  flag[16]= 0;
  flag[17]= 0;
  flag[18]= 0;
  *eta=0.0;
  *Nits=0;
  *TRN_Npats=0;
  *APP_Npats=0;
  *Vm=-1;
  *Vsize=*Vm;
  *Xsize=-1;
  *Hsize=-1;
  *Ysize=*Vsize;
  *wtrng=WTRNGDEF;
  *ranseed=RANSEEDDEF;
  *macitmax=0;
  *macconvtol=0.0;
  *macchecktol=MACCHECKTOLDEF;
  /* internal macopt parameters are set to default values by calling
     macopt_defaults() in callmacopt() */
  
  /* read specfile */
  
  sf=fopen(sfname,"r");
  if(!sf) {
    sprintf(message, "specread(): specfile %s could not be opened", sfname);
    neterr(message);
  }

  while(fscanf(sf,"%s",word)!=EOF) {

    if(word[0]=='#') { /* NB Must use single quotes! */
      fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
      continue;
    }
    else if(strcmp(word,"verbosity_level_(0/1/2/3/4)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum>=0 && inum<=4)
	flag[14]=inum; /* otherwise retain default */
    }
    else if(strcmp(word,"train_network?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0) flag[1]=1;
    }
    else if(strcmp(word,"apply_network?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0) flag[2]=1;
    }
    else if(strcmp(word,"NET:number_of_state_variables_(V)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum<=0)
	neterr("specread(): Number of state variables (V) must exceed zero");
      if(inum>NODEWARNSIZE && flag[14]>=1)
	netwarn("specread(): That's a lot of state variables (V)");
      *Vsize=inum;
    }
    else if(strcmp(word,"NET:number_of_recurrent_inputs_(V)")==0) {
      /* This option is the same as the previous one. I used to call
	 the state variables "recurrent inputs", so this option
	 enables backward compatibility with old specfiles. */
      fscanf(sf,"%d",&inum);
      if(inum<=0)
	neterr("specread(): Number of recurrent inputs (V) must exceed zero");
      if(inum>NODEWARNSIZE && flag[14]>=1)
	netwarn("specread(): That's a lot of recurrent inputs (V)");
      *Vsize=inum;
    }
    else if(strcmp(word,"NET:number_of_measured_state_variables_(Vm)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum<=0)
	neterr("specread(): Number of measured state variables (Vm) must exceed zero");
      *Vm=inum;
    }
    else if(strcmp(word,"NET:number_of_measured_recurrent_inputs_(Vm)")==0) {
      /* This option is the same as the previous one. I used to call
	 the state variables "recurrent inputs", so this option
	 enables backward compatibility with old specfiles. */
      fscanf(sf,"%d",&inum);
      if(inum<=0)
	neterr("specread(): Number of measured recurrent inputs (Vm) must exceed zero");
      *Vm=inum;
    }
    else if(strcmp(word,"NET:number_of_external_inputs_(X)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum<0)
	neterr("specread(): Number of external inputs (X) cannot be negative");
      if(inum>NODEWARNSIZE && flag[14]>=1)
	netwarn("specread(): That's a lot of external inputs (X)");
      *Xsize=inum;
    }
    else if(strcmp(word,"NET:number_of_hidden_nodes_(H)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum<0)
	neterr("specread(): Number of hidden nodes (H) cannot be negative");
      if(inum>NODEWARNSIZE && flag[14]>=1)
	netwarn("specread(): That's a lot of hidden nodes (H)");
      *Hsize=inum;
    }
    else if(strcmp(word,"NET:data_scaling_(none/var/maxmin/netsize)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"none")==0)
	flag[12]=0;
      else if(strcmp(word,"var")==0)
	flag[12]=1;
      else if(strcmp(word,"maxmin")==0)
	flag[12]=2;
      else if(strcmp(word,"netsize")==0)
	flag[12]=3;
      else
	neterr("specread(): Option for\nNET:data_scaling_method_(none/var/maxmin) ... is not valid");
    }
    else if(strcmp(word,"NET:input_weight_file")==0) {
      fscanf(sf,"%s",inwtfname);
      flag[10]=1;
    }
    else if(strcmp(word,"TRN:optimization_method_(grd/macopt)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"macopt")==0) flag[3]=2;
      else if(strcmp(word,"grd")==0) flag[3]=1;
      else neterr("specread(): Option for\nTRN:Optimization_method_(grd/macopt) ... is not valid");
    }
    else if(strcmp(word,"TRN:update_method_(1/3/4)")==0) {
      fscanf(sf,"%d",&inum);
      if(inum==1 || inum==3 || inum==4)
	flag[4]=inum;
      else
	neterr("specread(): Option for TRN:Update_method_(1/3/4) is not valid");
    }
    else if(strcmp(word,"TRN:number_of_temporal_pattern_files")==0) {
      fscanf(sf,"%d", TRN_Npats);
      if(*TRN_Npats>MAXTPFILES) {
	sprintf(message, "(specread(): You have chosen TRN_Npats=%d which exceeds MAXTPFILES", *TRN_Npats);
	neterr(message);
      }
      else if(*TRN_Npats<=0) {
	sprintf(message, "(specread(): TRN_Npats=%d is invalid", *TRN_Npats);
	neterr(message);
      }
      for(p=1;p<=*TRN_Npats;++p) {
	fscanf(sf,"%s",word);
	if(word[0]=='#') { /* this accommodates hashed out files */
	  fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	  p--;
	  continue;
	}
	TRN_tpfname[p]=Cvector((int)(strlen(word)+1)); /* deallocated in main() */
	strcpy(TRN_tpfname[p],word);
	if(flag[1]==1 && flag[14]>=2)
	  printf("specread(): Train file no.%3d: %s\n", p, TRN_tpfname[p]);
      }
    }
    else if(strcmp(word,"TRN:number_of_temporal_process_files")==0) {
      /* This option is the same as the previous one. I used to call
	 the temporal patterns "temporal processes" but that is
	 misleading.  However, I keep this option to enable backward
	 compatibility with old specfiles. */
      fscanf(sf,"%d", TRN_Npats);
      if(*TRN_Npats>MAXTPFILES) {
	sprintf(message, "(specread(): You have chosen TRN_Npats=%d which exceeds MAXTPFILES", *TRN_Npats);
	neterr(message);
      }
      else if(*TRN_Npats<=0) {
	sprintf(message, "(specread(): TRN_Npats=%d is invalid", *TRN_Npats);
	neterr(message);
      }
      for(p=1;p<=*TRN_Npats;++p) {
	fscanf(sf,"%s",word);
	if(word[0]=='#') { /* this accommodates hashed out files */
	  fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	  p--;
	  continue;
	}
	TRN_tpfname[p]=Cvector((int)(strlen(word)+1)); /* deallocated in main() */
	strcpy(TRN_tpfname[p],word);
	if(flag[1]==1 && flag[14]>=2)
	  printf("specread(): Train file no.%3d: %s\n", p, TRN_tpfname[p]);
      }
    }
    else if(strcmp(word,"TRN:weight_decay_(none/default/list)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"default")==0)
	flag[8]=1;
      if(strcmp(word,"list")==0)
	flag[8]=2;
      /* alpha parameters themselves are read in later in dynettrain */
    }
    else if(strcmp(word,"TRN:use_beta_parameters?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0)
	flag[7]=1;
      /* beta parameters themselves are read in later in dynettrain */
    }
    else if(strcmp(word,"TRN:output_weight_file")==0) {
      fscanf(sf,"%s",otwtfname);
      flag[11]=1;
    }
    else if(strcmp(word,"TRN:error_file")==0) {
      fscanf(sf,"%s",errfname);
      flag[16]=1;
    }
    else if(strcmp(word,"TRN:form_of_weight_init_(uniform/gaussian)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"uniform")==0)
	flag[9]=1;
      else if(strcmp(word,"gaussian")==0)
	flag[9]=2;
      else
	neterr("specread(): Option for\nTRN:Form_of_weight_init_(uniform/gaussian) ... is not valid");
    }
    else if(strcmp(word,"TRN:initial_weight_range")==0) {
      fscanf(sf,"%lf",wtrng);
      if(fabs(*wtrng)>WTRNGWARNSIZE && flag[14]>=1)
	netwarn("specread(): Initial weight range is very large");
    }
    else if(strcmp(word,"TRN:random_number_seed")==0) {
      fscanf(sf,"%d",&inum);
      *ranseed=-(long)(abs((int)inum));
    }
    else if(strcmp(word,"GRD:number_of_iterations")==0) {
      fscanf(sf,"%d",&inum);
      if(inum<=0)
	neterr("specread(): Number of iterations must exceed zero");
      *Nits=inum;    
    }
    else if(strcmp(word,"GRD:learning_rate")==0) {
      fscanf(sf,"%lf",&fnum);
      if(fnum<=0.0)
	neterr("specread(): Learning rate must exceed zero");
      *eta=fnum;    
    }
    else if(strcmp(word,"MAC:convergence_tolerance_gradient")==0) {
      fscanf(sf,"%lf",macconvtol); 
      *macconvtol *= *macconvtol;
      if(*macconvtol<=0.0)
	neterr("specread(): MAC:convergence_tolerance must be positive");
    }
    else if(strcmp(word,"MAC:convergence_tolerance")==0) {
      /* obsolete, but retained for backwards compatibility */
      fscanf(sf,"%lf",macconvtol); 
      if(*macconvtol<=0.0)
	neterr("specread(): MAC:convergence_tolerance must be positive");
    }
    else if(strcmp(word,"MAC:maximum_number_of_iterations")==0) {
      fscanf(sf,"%d",macitmax); 
      if(*macitmax<0)
	neterr("specread(): MAC:maximum_number_of_iterations cannot be negative");
    }
    else if(strcmp(word,"MAC:perform_maccheckgrad?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0)
	flag[13]=1;
    }
    else if(strcmp(word,"MAC:maccheckgrad_tolerance")==0) {
      fscanf(sf,"%lf",macchecktol); 
      if(*macchecktol<=0.0)
	neterr("specread(): MAC:maccheckgrad_tolerance must be positive");
    }
    else if(strcmp(word,"APP:plot_file_name")==0) {
      fscanf(sf,"%s",word);
      strcpy(plotfname,word);
      flag[15]=1;
    }
    else if(strcmp(word,"APP:include_v(t=0)_in_plot_file?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0)
	flag[17]=1;
    }
    else if(strcmp(word,"APP:write_tper_files?_(yes/no)")==0) {
      fscanf(sf,"%s",word);
      if(strcmp(word,"yes")==0)
	flag[18]=1;
    }
    else if(strcmp(word,"APP:number_of_temporal_pattern_files")==0) {
      fscanf(sf,"%d", APP_Npats);
      if(*APP_Npats>MAXTPFILES) {
	sprintf(message, "(specread(): You have chosen APP_Npats=%d which exceeds MAXTPFILES", *APP_Npats);
	neterr(message);
      }
      else if(*APP_Npats<=0) {
	sprintf(message, "(specread(): APP_Npats=%d is invalid", *APP_Npats);
	neterr(message);
      }
      for(p=1;p<=*APP_Npats;++p) {
	fscanf(sf,"%s",word);
	if(word[0]=='#') { /* this accommodates hashed out files */
	  fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	  p--;
	  continue;
	}
	APP_tpfname[p]=Cvector((int)(strlen(word)+1)); /* deallocated in main() */
	strcpy(APP_tpfname[p],word);
	if(flag[2]==1 && flag[14]>=2)
	  printf("specread(): Apply file no.%3d: %s\n", p, APP_tpfname[p]);
      }
    }
    else if(strcmp(word,"APP:number_of_temporal_process_files")==0) {
      /* This option is the same as the previous one. I used to call the
	 temporal patterns "temporal processes" but that is misleading.
	 However, I keep this option to enable backward compatibility with
	 old specfiles. */
      fscanf(sf,"%d", APP_Npats);
      if(*APP_Npats>MAXTPFILES) {
	sprintf(message, "(specread(): You have chosen APP_Npats=%d which exceeds MAXTPFILES", *APP_Npats);
	neterr(message);
      }
      else if(*APP_Npats<=0) {
	sprintf(message, "(specread(): APP_Npats=%d is invalid", *APP_Npats);
	neterr(message);
      }
      for(p=1;p<=*APP_Npats;++p) {
	fscanf(sf,"%s",word);
	if(word[0]=='#') { /* this accommodates hashed out files */
	  fgets(word, WORDSIZE, sf); /* read rest of line, including \n */
	  p--;
	  continue;
	}
	APP_tpfname[p]=Cvector((int)(strlen(word)+1)); /* deallocated in main() */
	strcpy(APP_tpfname[p],word);
	if(flag[2]==1 && flag[14]>=2)
	  printf("specread(): Apply file no.%3d: %s\n", p, APP_tpfname[p]);
      }
    }
  }
  fclose(sf);
  
  /* Errors */

  if(*Vm==-1) neterr("specread(): Number of state variables (V) has not been set");
  if(*Vsize==-1) neterr("specread(): Number of state variables (V) has not been set");
  if(*Xsize==-1) neterr("specread(): Number of external inputs (X) has not been set");
  if(*Hsize==-1) neterr("specread(): Number of hidden nodes (H) has not been set");
  if(*Vm>*Vsize) neterr("specread(): Number of measured state variables (Vm) cannot exceed\ntotal number of state variables (V)");
  if(flag[1]==1 && *TRN_Npats==0)
    neterr("specread(): Network training requested but no training data given\n");
  if(flag[2]==1 && *APP_Npats==0)
    neterr("specread(): Network application requested but no application data given\n");
  if(flag[12]==2)
    neterr("specread(): maxmin scaling option not yet implemented");
  if(flag[3]==2) { /* if using macopt */
    if(*macitmax<=0) neterr("specread(): maximum number of macopt iterations has not been set");
    if(*macconvtol<=0.0) neterr("specread(): macopt convergence tolerance has not been set");
  }
  
  /* Warnings */

  if(flag[14]>=1) {
    if(flag[1]==0 && flag[10]==0 && flag[2]==1) {
      netwarn("specread(): Application of network without training or reading of weights\n file requested. Application will take place with random weights.");
      if(flag[12]==1 || flag[12]==2)
	netwarn("specread(): Scaling has been requested with random weights: Note that\n results and weights will vary if the tp files specified in the specfile is\n varied. This is because scaling factors are calculated using all tp files.");
    }
    if(flag[1]==1 && flag[10]==1) 
      netwarn("specread(): Training of network starting from given weights requested\n New data will be scaled according to scales in given weights file\n This will over-ride scaling method requested in spec file if necessary.");
    /* if flag[11]=1 and otwtfile exists, could warn user of overwrite */
    if(flag[1]==1 && flag[11]==0) {
      sprintf(message, "specread(): Network training requested but no output weights file specified.\n Weights will be written to file %s", DEFWTNAME);
      netwarn(message);
      strcpy(otwtfname,DEFWTNAME);
      flag[11]=1;
    }
    if(flag[3]==2 && flag[4]!=4) {
      netwarn("specread(): flag[4] option not impemented with macopt; setting flag[4] = 4");
    }
    if(flag[12]==0)
      netwarn("specread(): No data scaling requested. Weight saturation may occur");
    if(flag[12]==3)
      netwarn("specread(): Only netsize (Hlam) data scaling requested\n Weight saturation may occur");
  }
  
  /* make specifications consistent */

  *Ysize=*Vsize;
  
}



void dataread(int Vsize, int Vm, int Xsize, char *tpfname[],
	      int Npats, double **tsteps, int *ntsteps, double ***x,
	      targets ***tar, int *flag)
{

  
  /*  Read in temporal pattern files (.tpin files)

      tar is the name for the ideal values of the recurrent inputs.
      This name is slightly misleading as for t=0 they are not
      the targets but the initial conditions. Also, when applying
      (rather than training) the network, they are the ideal values
      which we may have in a file (as required when testing the
      network).
      
      If the initial conditions (i.e. tar at t=0) are not defined
      (i.e. "x" in tpin file), dynetinit sets these to VINITDEF
      (defined in dynet.h).
      */

  int p,t,l,k;
  int Xtmp,Vmtmp;
  FILE *tpf;
  char message[MESSAGESIZE];
  char word[WORDSIZE];

  
  if(flag[14]>=2) printf("dataread(): Reading in data\n");

  for(p=1;p<=Npats;++p) {
    tpf=fopen(tpfname[p],"r");
    if(!tpf) {
      sprintf(message, "dataread(): temporal pattern file %s not found",
	      tpfname[p]);
      neterr(message);
    }
    fgets(word, WORDSIZE, tpf);
    fgets(word, WORDSIZE, tpf);
    fgets(word, WORDSIZE, tpf);
    fgets(word, WORDSIZE, tpf);
    sscanf(word,"%d %d %d", &Vmtmp, &Xtmp, &ntsteps[p]);
    /* check consistency of file headers with specifications */
    if(Xtmp!=Xsize) {
      sprintf(message, "dataread(): Header in pd file %s states %d external inputs\n in conflict with spec file (%d)", tpfname[p], Xtmp, Xsize);
      neterr(message);
    }
    if(Vmtmp!=Vm) {
      sprintf(message, "dataread(): Header in pd file %s states %d measured state variables\n in conflict with spec file (%d)", tpfname[p], Vmtmp, Vm);
      neterr(message);
    }

    /* allocate memory 
       dellocation occurs in calling routine (dynettrain or dynetapply) */
    
    tsteps[p]= Dvector(ntsteps[p]);
    x[p]     = Dmatrix(ntsteps[p],Xsize+1);
    tar[p]   = tmatrix(ntsteps[p],Vsize); /* assume this initialises to zero
				  although this probably isn't necessary */

    /* Read in columns
       tstep at time t is the time between step t and step t-1.
       Thus although tsteps[p][0] are read in for all p (i.e. the
       first time step) they are not used. This is because the
       first relevant time step is that between the initial
       conditions at t=0 and the data at t=1.
       */

    fgets(word, WORDSIZE, tpf);
    for(t=0;t<ntsteps[p];++t) {
      fscanf(tpf," %*s %lf",&tsteps[p][t]);
      for(k=0;k<Vm;++k) { /* read measured state variables */
	fscanf(tpf,"%s",word);
	if( !(strcmp(word,"x")) || !(strcmp(word,"X")) ) {
	  tar[p][t][k].def = 0;
	  if(t==0 && flag[14]>=1) { /* warn if we have undefined initial conditions */
	    sprintf(message, "dataread(): Initial condition for a state variable in file %s\n is not defined. Default initial value of VINITDEF = %3.1f will be used.\n This will not be scaled if you are using scaling.", tpfname[p], VINITDEF);
	    netwarn(message);
	  }
	}
	else {
	  tar[p][t][k].val = atof(word);
	  tar[p][t][k].def = 1;
	}
      }
      for(k=Vm;k<Vsize;++k)  /* add extra unmeasured state variables */
	tar[p][t][k].def = 0;
      x[p][t][Xsize]=XBIAS;  /* set biases */
      for(l=0;l<Xsize;++l) { /* read external inputs */
	fscanf(tpf,"%s",word);
	if( (!(strcmp(word,"x")) || !(strcmp(word,"X"))) && (t!=ntsteps[p]-1) ) {
	  /* can only have unspecified external inputs at last epoch */
	  sprintf(message, "dataread(): undefined external input specified at\n epoch %d in pattern %d. This is not permitted.", t, p);
	  neterr(message);
	}
	else
	  x[p][t][l] = atof(word);
      }
    } /* end of this epoch */

    fclose(tpf);
  } /* end of this pattern */

    
}


void writeweights(int Vsize, int Vm, int Xsize, int Hsize, int Ysize,
		  double **wtVH, double **wtXH, double **wtHY, double **vscale, 
		  double **xscale, double Hlam, char *otwtfname, int *flag)
{
  /* write out weights */
  
  FILE *otwtf; 
  char message[MESSAGESIZE];
  int k,l,m,n;

  otwtf=fopen(otwtfname,"w");
  if(!otwtf) { 
    sprintf(message, "writeweights(): weights file %s could not be opened\n",
	    otwtfname);
    neterr(message); /* or could dump weights to stderr as an emergency */
  }

  if(flag[14]>=2)
    printf("writeweights(): Writing weights to %s\n", otwtfname);

  fprintf(otwtf, "# dynet weights file - do not add or remove lines\n");
  fprintf(otwtf, "# ###############################################\n");
  fprintf(otwtf, "# V (tot state), Vm (meas state), X (ext input), H (hidden): (exc biases)\n");
  fprintf(otwtf, "  %d                %d              %d              %d\n",
	  Vsize, Vm, Xsize, Hsize);
  fprintf(otwtf, "# scaling type: \n");
  if(flag[12]==1) {
    fprintf(otwtf, "var\n");
    fprintf(otwtf, "# V (state variable) mean and stdev scaling factors:\n"); 
    for(k=0;k<Vsize;++k)
      fprintf(otwtf, " % 8.5e % 8.5e\n", vscale[0][k], vscale[1][k]);
    fprintf(otwtf, "# X (external input) mean and stdev scaling factors:\n"); 
    for(l=0;l<Xsize;++l)
      fprintf(otwtf, " % 8.5e % 8.5e\n", xscale[0][l], xscale[1][l]);
  }
  else if(flag[12]==3)
    fprintf(otwtf, "netsize\n");
  else
    fprintf(otwtf, "none\n");
  fprintf(otwtf, "# Lambda scale parameter for hidden layer:\n");
  fprintf(otwtf, " % 7.5f\n", Hlam);
  fprintf(otwtf, "# wtVH (state-hidden weights):\n");
  for(k=0;k<Vsize;++k) {
    for(m=0;m<Hsize;++m)
      fprintf(otwtf, " % 8.5f ", wtVH[k][m]);
    fprintf(otwtf, "\n");
  }
  fprintf(otwtf, "# wtXH (input-hidden weights):\n");
  for(l=0;l<Xsize+1;++l) {
    for(m=0;m<Hsize;++m)
     fprintf(otwtf, " % 8.5f ", wtXH[l][m]);
    fprintf(otwtf, "\n");
  }
  fprintf(otwtf, "# wtHY (hidden-output weights):\n");
  for(m=0;m<Hsize+1;++m) {
    for(n=0;n<Ysize;++n)
      fprintf(otwtf, " % 8.5f ", wtHY[m][n]);
    fprintf(otwtf, "\n");
  }
  
  fclose(otwtf);
  
}

void evtpnewname(char *tpnewname, char *tpfname, int *flag)
{
   /* evaluate output file name; cut-off known suffices if present */

  int n;
  char message[MESSAGESIZE];

  char *infsuf1 = INFILESUFFIX1;
  char *infsuf2 = INFILESUFFIX2;
  char *newfsuf = Cvector(FNAMESIZE);
  
  size_t tpfnamelen = strlen(tpfname);
  size_t infsuflen1 = strlen(infsuf1);
  size_t infsuflen2 = strlen(infsuf2);

  if(flag[54]==1)
    newfsuf = OTFILESUFFIX;
  else if(flag[54]==2)
    newfsuf = ERFILESUFFIX;
  else { 
    sprintf(message, "evtpnewname(): flag[54] = %d is not recognised", flag[54]);
    neterr(message);
  }

  strcpy(tpnewname,"");
  if(strcmp(&tpfname[tpfnamelen-infsuflen1],infsuf1)==0) {
    for(n=0;n<(int)(tpfnamelen-infsuflen1);++n)
      sprintf(&tpnewname[n],"%c",tpfname[n]);
  }
  else if(strcmp(&tpfname[tpfnamelen-infsuflen2],infsuf2)==0) {
    for(n=0;n<(int)(tpfnamelen-infsuflen2);++n)
      sprintf(&tpnewname[n],"%c",tpfname[n]);
  }
  /*  The above loop is instead of the following which didn't work:
      strncpy(tpnewname,(const char*)(tpfname[p]),
      (size_t)(tpfnamelen-infsuflen));
  */
  else
    sprintf(tpnewname,"%s.",tpfname);
  strncat(tpnewname,newfsuf,strlen(newfsuf));
  
  free_Cvector(newfsuf,0);

}



/* ==================== Error Evaluation Routines =================== */

double wderr(int Vsize, int Xsize, int Hsize, int Ysize,
	     double **wtVH, double **wtXH, double **wtHY, double *alpha)
{
  /* return weight decay contribution to total error */

  double err = 0.0; /* returned by this subroutine */
  int k,l,m,n;
  
  for(m=0;m<Hsize;++m) { /* not including H bias */
    for(k=0;k<Vsize;++k) /* VH */
      err += alpha[0]*wtVH[k][m]*wtVH[k][m]; 
    for(l=0;l<Xsize;++l) /* XH */
      err += alpha[1]*wtXH[l][m]*wtXH[l][m];
    err += alpha[2]*wtXH[Xsize][m]*wtXH[Xsize][m]; /* input bias */
  }
  for(m=0;m<Hsize+1;++m) { /* including H bias */
    for(n=0;n<Ysize;++n) /* HY */
      err += alpha[3]*wtHY[m][n]*wtHY[m][n]; 
  }

  err /= 2.0;
  return (err);
}
