/*****************************************************************************
 * $Id:: main.c 8651 2011-11-18 10:07:27Z nxp28536                           $
 *
 * Nota Bene:******** This is the Cortex M4 file **********************
 *
 * Description:
 *   This is the Cortex-M4 code for a project that illustrates the difference
 *  in execution time of the M4 and M0 cores inside the LPC4350. Both cores will
 *  run what is all-but the same code. This in a way mirrors ARM's big.LITTLE
 *  strategy on the Cortex-A15 and Cortex-A7. The idea is that when a lot of 
 *  performance is needed, the A15 (or, in this case, the M4) runs the code. When
 *  there's less going on, that core shuts down and the little A7 (M0) runs to conserve
 *  power. We'll use external tools to measure the differing execution times.
 *
 *   This code was created by modifying the project in
 *  c:\keil\ARM\Boards\Hitex\LPC4350\Examples\IPC\IPC\Mm4_m0_ipc_mbx_techcon and
 *  associated subdirectories. This was done to avoid having to create a new project
 *  and work out new make and project files. Hence, the directory names are still
 *  those of that example.
 *   
 * Coded:
 *   December 2011 and January 2012 by Jack Ganssle
 *
 * Copyright(s):
 *   Jack Ganssle releases all of this code to the public domain, insofar as
 *  he is permitted. Since this code was derived from that provided from Keil and NXP,
 *  those organizations may hold copyrights on this work. Check with them.
 *
 *----------------------------------------------------------------------------
 * Software that is described herein is for illustrative purposes only
 * which provides customers with programming information regarding the
 * products. This software is supplied "AS IS" without any warranties.
 * NXP Semiconductors and Jack Ganssle assume no responsibility or liability for the
 * use of the software, conveys no license or title under any patent,
 * copyright, or mask work right to the product. NXP Semiconductors and Jack Ganssle
 * reserves the right to make changes in the software without
 * notification. NXP Semiconductors  and Jack Ganssle also make no representation or
 * warranty that such application will be suitable for the specified
 * use without further testing or modification.
 *****************************************************************************/
#ifndef CORE_M4
	  #error "Build Error: please define CORE_M4 in the project settings" 
#endif

#include "LPC43xx.h"
#include "platform_config.h"
#include "uarthandler.h"

#include "ipc_mbx.h"

#include "platform_check.h"

#include "stdlib.h"
#include "string.h"
#include "math.h"
#include "stdio.h"
#include "ctype.h"

/* NOTE(review): BUFSIZE appears to be unused in this file - confirm before removing. */
#define BUFSIZE 80

/* Explicitly sized names for the floating point types used by the tests. */
typedef float  float32_t;
typedef double float64_t;

/* Prototypes for the functions defined further down in this file. */
void error(int);
void toggle(int);
void run_test(void);
void test_both_sleep(void);
void test0(void);
uint32_t test0_sqrt(uint32_t);
void test1(void);
float32_t test1_sqrt(float32_t);
void test2(void);
uint32_t test2_fir(void);
void test3(void);

/*
 * Test selector. run_test() looks only at the first character, which must
 * be one of '.', '/', '0', '1', '2' or '3'. main() also passes the address
 * of this string to the M0 in the PRINT_WELCOME_MESSAGE message.
 */
char test_number[] = "2\n\r";				// Which test we're running

/*
 * The M0 program image is pulled in as a C array named LR0 by including
 * the file named by SLAVE_IMAGE_FILE. main() hands LR0 and sizeof(LR0) to
 * IPC_downloadSlaveImage(). The lone "const" below is deliberately left
 * dangling so that it qualifies the array declared by the included file.
 */
/* fromelf.exe always generates an unsigned char LR0[] type of array */
const	/* DO NOT REMOVE THIS CONST QUALIFIER, IS USED TO PLACE THE IMAGE IN M4 ROM */

#include SLAVE_IMAGE_FILE

/* Address of the Coprocessor Access Control Register, written by fpu_init(). */
#define LPC_CPACR         0xE000ED88              

/*
 * Addresses of the MVFR0 / MVFR1 feature registers, and the values that
 * fpu_init() expects to read from them before it will enable the FPU.
 */
#define SCB_MVFR0           0xE000EF40
#define SCB_MVFR0_RESET     0x10110021

#define SCB_MVFR1           0xE000EF44
#define SCB_MVFR1_RESET     0x11000011

/**********************************************************************

Function fpu_init()

 This function turns on the floating point unit, but only when the
core reports that one is fitted. It reads the two feature registers
MVFR0 and MVFR1 and compares them with the values expected for a core
that has an FPU. If either differs, nothing is changed.

 When both match, bits 20-23 of CPACR are set, which gives full access
to coprocessors CP10 and CP11. This is the C equivalent of the
sequence in the ARM TRM:

                LDR.W R0, =0xE000ED88    ; address of CPACR
                LDR   R1, [R0]           ; read CPACR
                ORR   R1, R1, #(0xF << 20)
                STR   R1, [R0]           ; write it back

 NOTE(review): ARM's published enable sequence follows the store with
DSB and ISB barriers; none are issued here - confirm that no floating
point instruction can execute before the write takes effect.

**********************************************************************/
void fpu_init(void)
{
    volatile uint32_t *const cpacrReg = (uint32_t*) LPC_CPACR;
    volatile uint32_t *const mvfr0Reg = (uint32_t*) SCB_MVFR0;
    volatile uint32_t *const mvfr1Reg = (uint32_t*) SCB_MVFR1;
    uint32_t features0;
    uint32_t features1;

    // Both feature registers are always read, MVFR0 first.
    features0 = *mvfr0Reg;
    features1 = *mvfr1Reg;

    // No FPU reported: leave CPACR alone.
    if((features0 != SCB_MVFR0_RESET) || (features1 != SCB_MVFR1_RESET))
    {
        return;
    }

    // One read and one write of CPACR: enable CP10 and CP11 for full access
    *cpacrReg |= (0xF << 20);
}
/**********************************************************************

Function main() for the M4.

 This function waits until the Cortex-M0 is not busy (so it is in
a low-power sleep state) and then executes one of a number of
cycle-burning functions selected by the variable "test_number." It then
goes into sleep mode after signalling the Cortex-M0 that the
M4 is idle. 

 The idea is that each core alternates execution. There will be a low-ohm
resistor in series with the board's power supply ground. The problem is that the board
is chock-full of cool stuff, and has a nominal power draw of about 0.25 amps.
I suspect the difference in power used by the M0 vs the M4 won't be huge,
so will be rather in the noise compared to that 0.25 amps. It could be on the 
order of 1 ma, for all I know (the LPC4350's datasheets are, at this moment,
rather silent on power needs). I don't have an ammeter that can
differentiate a single milliamp out of a quarter amp, so will build
a differential amp with op-amps to measure the drop across the series resistor.
A pot will dial out the board's "bias" current. The output
will go to a scope to get a good view of what's going on. GPIO bits
will signal which core is operating when.

 I considered using the scope to measure the current across the resistor 
with the input channel set to AC to remove the bias current. That
won't work, as I expect the signal to be rather square-wave-ish. The scope's
series capacitor would differentiate the signal rather than passing it.

 To make it clear which core is executing when, two GPIO bits are used, one 
for each core. When the M4 is running its bit is set; when the M0 is running 
its bit is set.

 Several errors are possible, in which case function error() will be called.
That puts the core in an infinite loop while it outputs pulse patterns on
the GPIO bit to indicate which error has occurred.

 There's also a diagnostic function, named toggle(), which provides
a momentary pulse on the GPIO bit to aid in seeing where time is being
consumed.

**********************************************************************/
int main (void) 
{
/**************** Initializations ************************************/

/* initializes the platform dependent things - I/O, clock, fpu etc */
  platformInit();
  fpu_init();

// just in case
  IPC_haltSlave();

/* setup the local master mailbox system */
  IPC_initMasterMbx(&Master_CbackTable[0], (Mbx*) MASTER_MBX_START, (Mbx*) SLAVE_MBX_START);

/* download the cpu image */
  IPC_downloadSlaveImage(SLAVE_ROM_START, &LR0[0], sizeof(LR0));

/* start the remote processor */
  IPC_startSlave();

// wait for the M0 to signal being ready via a message to the command queue
  while(mbxFlags[MASTER_MBX_CMD] != MSG_PENDING) __WFE();

/* free our mbx, but only if the message really is the start-up notice */
  if(NOTIFY_SLAVE_STARTED == IPC_getMsgType(MASTER_MBX_CMD)) {
    IPC_freeMbx(MASTER_MBX_CMD);
  }

// Send the test number to the M0, provided its command mailbox is free
  if(IPC_queryRemoteMbx(SLAVE_MBX_CMD) == READY) {
    IPC_sendMsg(SLAVE_MBX_CMD, PRINT_WELCOME_MESSAGE, (msgId_t) 0xF, (mbxParam_t) &test_number);
  }

/*****************************  Main loop ****************************/
  while(1)
  {

/*
 We've just woken up since the M0 has completed its work and gone to sleep. 
Set a GPIO bit to trigger the scope. Note that the GPIO bit is found on
connector X3 pin 11.

 This is not encapsulated in a driver, per normal practice, so that it,
and the corresponding I/O write before going to sleep, has the fewest
delays, and gives a scope reading that mirrors operation as closely 
as possible. Mea culpa. 

 The SET and CLR registers act only on the bits written as one, so a
plain store is all that is needed. A read-modify-write (|=) would add
a bus read ahead of the pin change, and CLR is a write-only register
whose read-back value should not be relied on.
*/
    LPC_GPIO5->SET = (1UL << 3);

/*
  Call the function that runs tests. This is the real meat of 
 this program. It's coded as a separate function to make it easier
 to copy to the other core. This ensures the code is identical (at
 a source level) in both cores, ensuring a fair timing test.
*/
    run_test();

/*********  We're all done; notify the M0 and go to sleep. ***********/

// First, unset the GPIO bit so the scope shows we're sleeping.
    LPC_GPIO5->CLR = (1UL << 3);

/* Tell the M0 to wake up. This is done by issuing an SEV instruction.
The M0 is sleeping when idle, with a WFE instruction hanging
out there. The SEV will cause the WFE to fall through, and the
M0 to restart.

This is a little clunky. I'm using a pair of WFEs, as the SEV
issues an event interrupt to both cores. The first will fall through
but clear the event register; the second will put the M4 into a
sleep state.
*/
    __DSB();    // Let outstanding memory accesses (the GPIO write) complete
    __SEV();    // Send an event interrupt to the other core
    __WFE();    // We'll get it too; clear the event register
    __WFE();    // Go to sleep

  } // while(1) - main loop

}   // main()


/**********************************************************************

Function run_test()

 This function, which is identical in both cores, executes one of
a number of tests. The test number is specified in variable test_number.

 test_number '/' invokes no tests. It's used to get timing of the 
system without tests, sort of like getting a tare weight on a scale.

 test_number '.' lets the cores run for a bit then puts them both
to sleep. This gives us a minimum power consumption.

**********************************************************************/
void run_test(void)
{
//  Sanity check on test numbers
  if((test_number[0]<'.') || (test_number[0]>'3'))error(2);

   switch(test_number[0])
  {
  	case '/': break;             // Null test for "tare" profiling
  	case '.': 
  	{
  	  test_both_sleep();         // Check both off sleep current.
  	  break;
  	  
  	}
  	case '0': 
  	{
  		test0();								   // Run some basic C operations
  		break;
  	}
  	case '1': 
  	{
  		test1();								   // Run some single precision floating point
  		break;
  	}
  	case '2':
  	{
  		test2();							     // Run an FIR filter
  		break;
  	}
  	case '3':
  	{
  	  test3();									// Run the __sqrtf() intrinsic
  	  break;
  	}	
  			
	default:;
  }
}

/**********************************************************************

Function test_both_sleep()

 This function, which is identical in both cores, spins through a
fast loop 30000 times, calling toggle(-1) on every pass, and then
puts the core to sleep forever. Should the core ever wake from the
WFE it calls toggle(-2) and goes straight back to sleep, so this
function never returns.

**********************************************************************/
void test_both_sleep(void)
{
  volatile int spins;

  // Busy phase; the counter is bumped before the scope bit is pulsed
  for(spins=0; spins<30000; )
  {
    spins++;
    toggle(-1);
  }

  // Sleep phase
  for(;;)
  {
    toggle(-2);
    __WFE();                  // sleep forever
  }
}

/**********************************************************************

Function test0()

 This function, which is identical in both cores, calls test0_sqrt()
on each of the integers 0 through 299 to compare cores running
integer math. Each result is stored in a volatile variable and
otherwise ignored; presumably the volatile is there so the compiler
keeps the calls.

**********************************************************************/
void test0(void)
{
  volatile uint32_t sink;
  int value;

  value = 0;
  while(value < 300)
  {
    sink = test0_sqrt(value);
    value++;
  }
}

/**********************************************************************

Function test0_sqrt()

 This function, which is identical in both cores, does an integer
square root using Jack Crenshaw's algorithm from his Math Toolkit for
Real-Time Programming.

 The argument is consumed two bits at a time, most significant pair
first, and one bit of the root is produced per pass, so 16 passes
cover all 32 bits. The value returned is the largest integer whose
square does not exceed the argument (e.g. 15 gives 3, 16 gives 4).

 Note that the algorithm depends on shifts: "a >> 30" extracts the top
two bits and "<< 1" / "<< 2" make room for the next digit. In C the
"^" operator is exclusive-or, not exponentiation, so 2^30 must not be
used to stand for two to the thirtieth power.

**********************************************************************/
uint32_t test0_sqrt(uint32_t a)
{
	uint32_t rem, root;
	int i;

	rem=root=0;
	for(i=0; i<16; i++)
	{
		root <<= 1;                        // make room for the next root bit
		rem = (rem << 2) + (a >> 30);      // bring down the next two bits of a
		a <<= 2;                           // and discard them from a
		root++;                            // trial: assume the new bit is a one
		if(root <= rem)
		{
			rem -= root;                     // the trial fits; keep the bit
			root++;
		}
		else
		{
			root--;                          // it does not fit; the bit is a zero
		}
	}
	return(root >> 1);
}

/**********************************************************************

Function test1()

 This function, which is identical in both cores, calls test1_sqrt()
on each of the values 0 through 299 to compare cores running
float32_t math. The loop counter is an int that is converted to
float32_t at each call. Each result is stored in a volatile variable
and otherwise ignored; presumably the volatile is there so the
compiler keeps the calls.

**********************************************************************/
void test1(void)
{
  volatile float32_t sink;
  int value;

  value = 0;
  while(value < 300)
  {
    sink = test1_sqrt(value);
    value++;
  }
}

/**********************************************************************

Function test1_sqrt()

 This function, which is identical in both cores, is a floating
point version of Jack Crenshaw's square root algorithm.

 The integer algorithm works on a 32 bit value two bits at a time,
using shifts. Here every shift is replaced by float32_t arithmetic:

   a >> 30   becomes  the integer part of a / 2^30  (always 0..3)
   a << 2    becomes  (a less those top two bits) * 4
   x << 1    becomes  x * 2

 All of the intermediate values are whole numbers small enough to be
held exactly in a float32_t, so the answer is exact: it is the
largest whole number whose square does not exceed the integer part
of the argument (e.g. 15.0 gives 3.0, 16.0 gives 4.0).

 Like the integer version, the argument is treated as a 32 bit
quantity. Negative arguments and NaN give 0.0; arguments of 2^32 or
more are clamped to the largest float32_t below 2^32.

 Note that in C the "^" operator is exclusive-or, so 2^30 is 28, not
two to the thirtieth power; the constant is written out in full.

**********************************************************************/
float32_t test1_sqrt(float32_t a)
{
  const float32_t two_pow_30 = (float32_t)1073741824.0;
  const float32_t max_arg    = (float32_t)4294967040.0;  // largest float32_t below 2^32
  float32_t rem, root, top;
  int i;

  // Keep the argument inside the range the 16 passes can handle. The
  // first test is written this way round so that it also catches NaN.
  if(!(a >= (float32_t)0.0)) return((float32_t)0.0);
  if(a > max_arg) a = max_arg;

  rem=root=(float32_t)0.0;
  for(i=0; i< 16; i++)
  {
    root=root*(float32_t)2.0;                      // make room for the next root bit
    top=(float32_t)(uint32_t)(a/two_pow_30);       // top two "bits" of a, 0..3
    rem=(rem*(float32_t)4.0) + top;                // bring them down into rem
    a=(a-(top*two_pow_30))*(float32_t)4.0;         // and discard them from a
    root=root+(float32_t)1.0;                      // trial: assume the new bit is a one
    if(root <= rem)
    {
      rem-= root;                                  // the trial fits; keep the bit
      root=root+(float32_t)1.0;
    }
    else root=root-(float32_t)1.0;                 // it does not fit; the bit is a zero
  }
  root=root/(float32_t)2.0;
  return(root);	
}

/**********************************************************************

Function test2()

 This function, which is identical in both cores, computes
FIRs: it calls test2_fir() twenty times. Each result is stored in a
volatile variable and otherwise ignored.

**********************************************************************/
void test2(void)
{
  volatile uint32_t sink;
  int remaining;

  for(remaining=20; remaining>0; remaining--)
  {
    sink = test2_fir();
  }
}

/**********************************************************************

Function test2_fir()

 This function, which is identical in both cores, implements
a simplified FIR filter to show the behavior of the M4's SIMD
instructions. The only difference in this code between cores is in the M0
the SMLAD instruction is expanded to instructions that will run
on that CPU. This code is derived from an NXP presentation at
http://ics.nxp.com/support/microcontrollers/esc.silicon.valley/pdf/dsp.pdf.
I have simplified the coefficients and data to use small sets,
and did not store the results, as these are not germane to this test.

  I believe that much of the time consumed by this routine will
 not be in the SMLAD instruction execution on the M4. So there are two
 ways to run this. If SIMD_ON is 1 it will execute the FIR filter. If 0
 the SMLADs are replaced with simple increments, so we can see the overhead.

  SIMD_ON must be the number 1 or 0. It is tested by the preprocessor,
 which treats any name it does not know as 0; if TRUE and FALSE
 are not preprocessor macros, a test such as "SIMD_ON == TRUE" is always
 true and both variants would be compiled in.

  The data set holds only 16 samples while the inner loop runs 200 times
 and consumes four samples per pass, so the sample pointer is rewound
 to the start of the data whenever fewer than four samples remain.

  The function makes two complete passes (outer loop) and returns the
 sum of the four accumulators from the final inner pass. With SIMD_ON
 set to 0 that is always 1600.

**********************************************************************/
#define SIMD_ON 0
uint32_t test2_fir(void)
{
  volatile uint32_t sum0, sum1, sum2, sum3; 
  uint32_t *statePtr, *coeffPtr;
  uint32_t c0, x0, x1, x2, x3;
  uint16_t i, sample;
  volatile uint32_t result;

// data and coeffs are arbitrary numbers generated by Excel's rand() function
  uint32_t data[]=  {237,341,624,657,163,183,129,454,450,999,889,527,991,823,163,307};
  uint32_t coeffs[]={663, 357, 528, 18};
  uint32_t *const dataEnd = data + (sizeof(data)/sizeof(data[0]));   // one past the last sample

  sample = 2;
  do
  {
    statePtr = data;
    sum0 = sum1 = sum2 = sum3 = 0;
    x0 = *statePtr++;
    x1 = *statePtr++;
    i = 200;
    do
    {
      // Four samples are read below; never let the pointer run off the array
      if((dataEnd - statePtr) < 4)
      {
        statePtr = data;
      }

      coeffPtr = coeffs;
      c0 = *(coeffPtr++);
      x2 = *(statePtr++);
      x3 = *(statePtr++);
#if SIMD_ON                             // If we're using the SIMD instructions
      // __SMLAD already adds its third operand to the products, so the
      // result is assigned; "+=" would count the running sum twice.
      sum0 = __SMLAD(x0, c0, sum0);
      sum1 = __SMLAD(x1, c0, sum1);
      sum2 = __SMLAD(x2, c0, sum2);
      sum3 = __SMLAD(x3, c0, sum3);
      c0 = *(coeffPtr++);
      x0 = *(statePtr++);
      x1 = *(statePtr++);
      sum0 = __SMLAD(x2, c0, sum0);
      sum1 = __SMLAD(x3, c0, sum1);
      sum2 = __SMLAD(x0, c0, sum2);
      sum3 = __SMLAD(x1, c0, sum3);
#else                                   // If we're not using the SIMD instructions
      sum0 += 1;
      sum1 += 1;
      sum2 += 1;
      sum3 += 1;
      c0 = *(coeffPtr++);
      x0 = *(statePtr++);
      x1 = *(statePtr++);
      sum0 += 1;
      sum1 += 1;
      sum2 += 1;
      sum3 += 1;
#endif

      result=sum0+sum1+sum2+sum3;       // Trick optimizer into generating all of the code
    } while(--i);
  } while(--sample);
  return result;
}
/**********************************************************************

Function test3()

 This function takes
square roots using the __aqrtf intrinsic, which will 
generate a VSQRT instruction on the M4.

**********************************************************************/
void test3(void)
{

  volatile float32_t result, i; 
  
  for(i=0.0; i<300.0; i+=1.0)
  {
  	result=__sqrtf(i);
  }
}


/**********************************************************************

Function error()

 This function is invoked in case an error occurs. It takes
an argument indicating the error number, which must never be
0. It then toggles the port bit associated with this
core a number of times equal to the error number, and then pauses,
before repeating the flashing forever. The result is that, by
watching the scope, we can tell which core errored, and which 
error number occurred. 

 This function never returns.

 The pause is a counting loop. Its counter is volatile so that the
compiler cannot throw the otherwise empty loop away.

 The port bit is driven with plain writes to the SET and CLR
registers; only the bits written as one are affected, so no
read-modify-write is needed.

Errors:

1 - M0 core:       Bad message from M4 when passing test number
2 - M0 & M4 cores: Test number is out of range

**********************************************************************/
void error(int error_number)
{
	int pulse;
	volatile int pause;

	while(1)
	{
		// One pulse on the scope bit per unit of the error number
		for(pulse=0; pulse< error_number; pulse++)
		{
			LPC_GPIO5->SET = (1UL << 3);
			LPC_GPIO5->CLR = (1UL << 3);
		}

		// Gap between bursts
		for(pause=0; pause< 1000; pause++)
		{
		}
	}
}

/**********************************************************************

Function toggle()

 This function is used in debugging and profiling execution
times. It takes an argument which tells it how many times to
toggle the scope bit associated with this core. 

The argument must never be zero. (If it is, nothing is output.)

If the argument is positive the routine toggles to a one and
back to zero. If negative, it toggles to a zero and back to one. The reason
for the two modes is to give a clean scope image. The size of the
argument is the number of pulses produced, so toggle(3) gives three
high-going pulses and toggle(-2) gives two low-going ones.

 The port bit is driven with plain writes to the SET and CLR
registers; only the bits written as one are affected, so no
read-modify-write is needed.

**********************************************************************/
void toggle(int count)
{
	// Positive count: pulses that go high and return low
	while(count > 0)
	{
		LPC_GPIO5->SET = (1UL << 3);
		LPC_GPIO5->CLR = (1UL << 3);
		count--;
	}

	// Negative count: pulses that go low and return high. Counting up
	// toward zero avoids having to negate the argument.
	while(count < 0)
	{
		LPC_GPIO5->CLR = (1UL << 3);
		LPC_GPIO5->SET = (1UL << 3);
		count++;
	}
}
