/*********************************************************************** 
 * $Id: main.c 8651 2011-11-18 10:07:27Z nxp28536 $
 * 
 * Nota Bene:******** This is the Cortex M0 file **********************
 *
 * Description:
 *   This is the Cortex-M0 code for a project that illustrates the difference
 *  in execution time of the M4 and M0 cores inside the LPC4350. Both cores will
 *  run what is all-but the same code. This in a way mirrors ARM's big.LITTLE
 *  strategy on the Cortex-A15 and Cortex-A7. The idea is that when a lot of 
 *  performance is needed, the A15 (or, in this case, the M4) runs the code. When
 *  there's less going on, the big core shuts down and the little A7 (M0) runs to
 *  conserve power. We'll use external tools to measure the differing execution times.
 *
 *   This code was created by modifying the project in
 *  c:\keil\ARM\Boards\Hitex\LPC4350\Examples\IPC\IPC\Mm4_m0_ipc_mbx_techcon and
 *  associated subdirectories. This was done to avoid having to create a new project
 *  and work out new make and project files. Hence, the directory names are still
 *  those of that example.
 *   
 * Coded:
 *   December 2011 and January 2012 by Jack Ganssle
 *
 * Copyright(s):
 *   Jack Ganssle releases all of this code to the public domain, insofar as
 *  he is permitted. Since this code was derived from that provided from Keil and NXP,
 *  those organizations may hold copyrights on this work. Check with them.
 *
 *----------------------------------------------------------------------------
 * Software that is described herein is for illustrative purposes only
 * which provides customers with programming information regarding the
 * products. This software is supplied "AS IS" without any warranties.
 * NXP Semiconductors and Jack Ganssle assume no responsibility or liability for the
 * use of the software, conveys no license or title under any patent,
 * copyright, or mask work right to the product. NXP Semiconductors and Jack Ganssle
 * reserves the right to make changes in the software without
 * notification. NXP Semiconductors  and Jack Ganssle also make no representation or
 * warranty that such application will be suitable for the specified
 * use without further testing or modification.
 *****************************************************************************/
#ifndef CORE_M0
	  #error "Build Error: please define CORE_M0 in the project settings" 
#endif

#include "LPC43xx.h"
#include "scu.h"
#include "type.h"
#include "math.h"

#include "uarthandler.h"

/* ipc comms */
#include "ipc_mbx.h"
#include "platform_check.h"

#include "stdlib.h"
#include "string.h"
#include "stdio.h"

/*
  Software stand-in for the SMLAD (dual 16-bit signed multiply with
 accumulate) operation, for cores such as the M3 and M0. Taken from
 the CMSIS.

  x, y : each is treated as two packed signed 16-bit halves
  sum  : accumulator value added to the two products

  Returns sum + (high half of x * high half of y)
              + (low half of x  * low half of y).
*/
  static __INLINE uint32_t __SMLAD(uint32_t x, uint32_t y, uint32_t sum)
  {
    const short x_high = (short) (x >> 16);
    const short y_high = (short) (y >> 16);
    const short x_low  = (short) x;
    const short y_low  = (short) y;

    /* Same left-to-right evaluation order as a single expression. */
    return (sum + (x_high * y_high) + (x_low * y_low));
  }

/* Floating point type aliases used by the test routines below. */
typedef float  float32_t;
typedef double float64_t;

/* Forward declarations for the functions defined later in this file. */
/* Diagnostics and board setup. */
void error(int);
void toggle(int);
void vIOInit(void);
/* Test dispatcher and the individual tests it can select. */
void run_test(void);
void test_both_sleep(void);
void test0(void);
uint32_t test0_sqrt(uint32_t);
void test1(void);
float32_t test1_sqrt(float32_t);
void test2(void);
uint32_t test2_fir(void);
void test3(void);

/*
  Storage for the test selection. main() receives test_number_p from the
 mailbox and copies the string it refers to into test_number; run_test()
 then selects a test from test_number[0].
*/
#define BUFSIZE 80        // size in bytes of the test_number buffer
mbxParam_t test_number_p;		  // pointer to test we're running - sent from M4
char test_number[BUFSIZE];		// test number itself

/**********************************************************************

Function main() for the M0.

 This code is philosophically the same as that running in the M4. Please
refer to main()'s header comments in the M4 code for a description of this
function. It is not repeated here in case those comments change.

What is different is that the M0's resources are used, rather than the M4's.

**********************************************************************/
int main(void)
{			
/************************* Initializations ***************************/
  int id= 0;
	uint32_t i;
			
// Init GPIO port
  vIOInit();				
	
// init local mailbox system 
  IPC_initSlaveMbx(&Slave_CbackTable[0], (Mbx*) MASTER_MBX_START, (Mbx*) SLAVE_MBX_START);

// signal back to M4 we are now ready
  IPC_sendMsg(MASTER_MBX_CMD, NOTIFY_SLAVE_STARTED, (msgId_t) 0, (mbxParam_t) 1);

// Get the test number from the M4
  while(mbxFlags[SLAVE_MBX_CMD] != MSG_PENDING);
  IPC_lockMbx(SLAVE_MBX_CMD);
  id = IPC_getMsgId(SLAVE_MBX_CMD);
  if (PRINT_WELCOME_MESSAGE == IPC_getMsgType(SLAVE_MBX_CMD)) 
  {
    test_number_p = IPC_getMbxParameter(SLAVE_MBX_CMD);
  }
  else error(1);
  IPC_freeMbx(SLAVE_MBX_CMD);
  memset(&test_number[0], 0, BUFSIZE);
  strcat(test_number,(char*) test_number_p);


/*****************************  Main loop ****************************/	
  while(1)
  {
 
/*
 We've just woken up since the M4 has complete its work and gone to sleep.
Set a GPIO to trigger the scope.  Note that this GPIO is LED 10,
which is also on JP3. I removed the LED so its power consumption
does not bias the results.

*/	
    LPC_GPIO4->SET |= (1UL << 1);
			
/*
  Call the function that runs tests. This is the real meat of 
 this program. It's coded as a separate function to make it easier
 to copy to the other core. This insures the code is identical (at
 a source level) in both cores, insuring a fair timing test.
*/
run_test();	
	
/*********  We're all done; notify the M4 and go to sleep. ***********/
	  	
// First, unset the GPIO bit so the scope shows we're sleeping.
    LPC_GPIO4->CLR |= (1UL << 1);

/* Tell the M4 to wake up. This is done by issuing an SEV instruction.
The M4 is sleeping when idle, with a WFE instruction hanging
out there. The SEV will cause the WFE to fall through, and the
M0 to restart.

This is a little clunky. I'm using a pair of WFEs, as the SEV
issues an event interrupt to both cores. The first will fall through
but clear the event register; the second will put the M0 into a
sleep state.
*/

    __DSB();		// Make sure all instructions are caught up
    __SEV();    // Send an event interrupt to the other core
    __WFE();    // We'll get it too; clear the event register
    __WFE();    // Go to sleep

  } // while(1) - main loop
		
}   // main()

/**********************************************************************

Function run_test()

 This function, which is identical in both cores, executes one of
a number of tests. The test number is specified in variable test_number.

 test_number '/' invokes no tests. It's used to get timing of the 
system without tests, sort of like getting a tare weight on a scale.

 test_number '.' lets the cores run for a bit then puts them both
to sleep. This gives us a minimum power consumption.

**********************************************************************/
void run_test(void)
{
//  Sanity check on test numbers
  if((test_number[0]<'.') || (test_number[0]>'3'))error(2);

   switch(test_number[0])
  {
  	case '/': break;             // Null test for "tare" profiling
  	case '.': 
  	{
  	  test_both_sleep();         // Check both off sleep current.
  	  break;
  	  
  	}
  	case '0': 
  	{
  		test0();								   // Run some basic C operations
  		break;
  	}
  	case '1': 
  	{
  		test1();								   // Run some single precision floating point
  		break;
  	}
  	case '2':
  	{
  		test2();							     // Run an FIR filter
  		break;
  	}
  	case '3':
  	{
  	  test3();									// Run the __sqrtf() intrinsic
  	  break;
  	}	
  			
	default:;
  }
}

/**********************************************************************

Function test_both_sleep()

 This function, which is identical in both cores, spins through a
short loop 30000 times, calling toggle(-1) on each pass, and then
alternates toggle(-2) with a WFE forever. It never returns.

**********************************************************************/
void test_both_sleep(void)
{
  volatile int spins;           // volatile keeps the busy loop in the generated code

  spins = 0;
  for(;;)
  {
    if(!(spins < 30000))
    {
      break;
    }
    spins++;
    toggle(-1);
  }

  for(;;)
  {
    toggle(-2);
    __WFE();                    // sleep forever
  }
}

/**********************************************************************

Function test0()

 This function, which is identical in both cores, calls test0_sqrt()
for each of the values 0 through 299 to compare cores running integer
math. Each result is stored in a volatile local and then discarded.

**********************************************************************/
void test0(void)
{
  volatile uint32_t root;       // volatile so the store is not optimized away
  int value = 0;

  while(value < 300)
  {
    root = test0_sqrt(value);
    value++;
  }
}

/**********************************************************************

Function test0_sqrt()

 This function, which is identical in both cores, does an integer
square root using Jack Crenshaw's algorithm from his Math Toolkit for
Real-Time Programming.

 a       : value whose root is wanted (any 32-bit unsigned value)
 returns : floor(sqrt(a)), which always fits in 16 bits

 The argument is consumed two bits at a time, most significant pair
first, so 16 passes cover all 32 bits. Only shifts, adds, subtracts
and compares are used; no floating point is involved.

 NOTE: if this routine is changed, make the same change to the copy in
the other core's source so the two timing tests stay comparable.

**********************************************************************/
uint32_t test0_sqrt(uint32_t a)
{
	uint32_t rem, root;
	int i;

	rem=root=0;
	for(i=0; i< 16; i++)
	{
		root <<= 1;                      // make room for the next result bit
		rem = (rem << 2) + (a >> 30);    // bring down the next two bits of a
		a <<= 2;
		root++;                          // trial: assume the next result bit is 1
		if(root <= rem)
		{
			rem -= root;                   // trial succeeded; keep the bit
			root++;
		}
		else
		{
			root--;                        // trial failed; the bit is 0
		}
	}
	return(root >> 1);                 // root was carried at twice its value
}

/**********************************************************************

Function test1()

 This function, which is identical in both cores, calls test1_sqrt()
for each of the values 0 through 299 to compare cores running
float32_t math. Each result is stored in a volatile local and then
discarded.

**********************************************************************/
void test1(void)
{
  volatile float32_t root;      // volatile so the store is not optimized away
  int value = 0;

  while(value < 300)
  {
    root = test1_sqrt(value);   // the int argument is converted to float32_t
    value++;
  }
}

/**********************************************************************

Function test1_sqrt()

 This function, which is identical in both cores, is a floating
point version of Jack Crenshaw's square root algorithm.

 a       : value whose root is wanted; must be in the range
           0 <= a < 2^32 (4294967296)
 returns : the integer part of the square root of a, as a float32_t

 The integer algorithm's shifts are carried out with single precision
multiplies and divides by powers of two:

   root << 1   becomes  root * 2
   rem  << 2   becomes  rem * 4
   a >> 30     becomes  the integer part of a / 2^30
   a << 2      becomes  (a with those top bits removed) * 4

 All constants are single precision so that no double precision
arithmetic is pulled into this test.

 NOTE: if this routine is changed, make the same change to the copy in
the other core's source so the two timing tests stay comparable.

**********************************************************************/
float32_t test1_sqrt(float32_t a)
{
	const float32_t two_to_30 = 1073741824.0f;   // 2^30 (in C, 2^30 is an XOR, not a power)
	float32_t rem, root, top;
	int i;

	rem=root=0.0f;
	for(i=0; i< 16; i++)
	{
		root=root*2.0f;                      // make room for the next result bit
		top=(float32_t)(int)(a/two_to_30);   // top two "bits" of a, a value from 0 to 3
		rem=(rem*4.0f) + top;                // bring them down into the remainder
		a=(a - (top*two_to_30))*4.0f;        // drop them from a and move the rest up
		root=root+1.0f;                      // trial: assume the next result bit is 1
		if(root <= rem)
		{
			rem-= root;                        // trial succeeded; keep the bit
			root=root+1.0f;
		}
		else
		{
			root=root-1.0f;                    // trial failed; the bit is 0
		}
	}
	return(root/2.0f);                     // root was carried at twice its value
}

/**********************************************************************

Function test2()

 This function, which is identical in both cores, runs test2_fir()
twenty times. Each result is stored in a volatile local and then
discarded.

**********************************************************************/
void test2(void)
{
  volatile uint32_t fir_out;    // volatile so the store is not optimized away
  int pass = 0;

  while(pass < 20)
  {
    fir_out = test2_fir();
    pass++;
  }
}

/**********************************************************************

Function test2_fir()

 This function, which is identical in both cores, implements
a simplified FIR filter to show the behavior of the M4's SIMD
instructions. The only difference in this code between cores is in the M0
the SMLAD instruction is expanded to instructions that will run
on that CPU. This code is derived from an NXP presentation at
http://ics.nxp.com/support/microcontrollers/esc.silicon.valley/pdf/dsp.pdf.
I have simplified the coefficients and data to use small sets,
and did not store the results, as these are not germane to this test.

 I believe that much of the time consumed by this routine will
not be in the SMLAD instruction execution on the M4. So there are two
ways to run this. If SIMD_ON is TRUE it will execute the FIR filter. If FALSE
the SMLADs are replaced with simple assignments, so we can see the overhead.

 Returns the sum of the four accumulators after the last pass.

 Implementation notes:
  - Each sample pass consumes 2 + 200*4 data values but the data set
    holds only 16, so the read index wraps around the data set instead
    of running off the end of the array.
  - __SMLAD() already adds its third argument to the products, so its
    result is assigned to the accumulator rather than added to it.
  - SIMD_ON is tested as a plain truth value with #if/#else, so exactly
    one of the two variants is compiled.

 NOTE: if this routine is changed, make the same change to the copy in
the other core's source so the two timing tests stay comparable.

**********************************************************************/
#define SIMD_ON FALSE
uint32_t test2_fir(void)
{
  // DATA_LEN must be a power of two so the index can be wrapped with a mask
  enum { DATA_LEN = 16, DATA_MASK = DATA_LEN - 1 };

  volatile uint32_t sum0, sum1, sum2, sum3;
  uint32_t *coeffPtr;
  uint32_t c0, x0, x1, x2, x3;
  uint16_t i, sample, stateIdx;
  volatile uint32_t result;

// data and coeffs are arbitrary numbers generated by Excel's rand() function
  uint32_t data[DATA_LEN]={237,341,624,657,163,183,129,454,450,999,889,527,991,823,163,307};
  uint32_t coeffs[]={663, 357, 528, 18};

  sample = 2;
  do
  {
    stateIdx = 0;
    sum0 = sum1 = sum2 = sum3 = 0;
    x0 = data[stateIdx++ & DATA_MASK];
    x1 = data[stateIdx++ & DATA_MASK];
    i = 200;
    do
    {
      coeffPtr = coeffs;
      c0 = *(coeffPtr++);
      x2 = data[stateIdx++ & DATA_MASK];
      x3 = data[stateIdx++ & DATA_MASK];
#if SIMD_ON                             // If we're using the SIMD instructions
      sum0 = __SMLAD(x0, c0, sum0);
      sum1 = __SMLAD(x1, c0, sum1);
      sum2 = __SMLAD(x2, c0, sum2);
      sum3 = __SMLAD(x3, c0, sum3);
      c0 = *(coeffPtr++);
      x0 = data[stateIdx++ & DATA_MASK];
      x1 = data[stateIdx++ & DATA_MASK];
      sum0 = __SMLAD(x2, c0, sum0);
      sum1 = __SMLAD(x3, c0, sum1);
      sum2 = __SMLAD(x0, c0, sum2);
      sum3 = __SMLAD(x1, c0, sum3);
#else                                   // If we're not using the SIMD instructions
      sum0 += 1;
      sum1 += 1;
      sum2 += 1;
      sum3 += 1;
      c0 = *(coeffPtr++);
      x0 = data[stateIdx++ & DATA_MASK];
      x1 = data[stateIdx++ & DATA_MASK];
      sum0 += 1;
      sum1 += 1;
      sum2 += 1;
      sum3 += 1;
#endif
      result=sum0+sum1+sum2+sum3;       // Trick optimizer into generating all of the code
    } while(--i);
  } while(--sample);
  return result;
}

/**********************************************************************

Function test3()

 This function does a square root via the normal system call, sqrt(),
for each of the values 0 through 299. The same test in the M4 uses the
intrinsic __sqrtf(). Results are stored in a volatile local and then
discarded.

 NOTE(review): sqrt() and the 0.0/1.0/300.0 literals are double
precision while the variables are float32_t, so every pass converts
between the two - confirm that is intended when comparing against the
single precision __sqrtf().

**********************************************************************/
void test3(void)
{
  volatile float32_t result, i;

  i = 0.0;
  while(i < 300.0)
  {
    result = sqrt(i);
    i += 1.0;
  }
}

/**********************************************************************

Function error()

 This function is invoked in case an error occurs. It takes
an argument indicating the error number, which must never be
0. It then toggles the port bit associated with this
core a number of times equal to the error number, and then pauses,
before repeating the flashing forever. The result is that, by
watching the scope, we can tell which core errored, and which
error number occurred.

 This function never returns.

Errors:

1 - M0 core:       Bad message from M4 when passing test number
2 - M0 & M4 cores: Test number is out of range

**********************************************************************/
void error(int error_number)
{
	int err;
	volatile int delay;   // volatile so the optimizer cannot remove the pause loop

	while(1)
	{
		// One pulse on the scope bit per unit of error_number
		for(err=0; err< error_number; err++)
		{
			LPC_GPIO4->SET |= (1UL << 1);
			LPC_GPIO4->CLR |= (1UL << 1);
		}

		// Pause so that successive groups of pulses can be told apart
		for(delay=0; delay< 1000; delay++)
		{
		}
	}
}

/**********************************************************************

Function toggle()

 This function is used in debugging and profiling execution
times. It takes an argument which tells it how many times to
toggle the scope bit associated with this core.

The argument must never be zero. (If it is zero, nothing is toggled.)

If the argument is positive the routine toggles to a one and
back to zero. If negative, it toggles to a zero and back to one. The reason
for the two modes is to give a clean scope image.

 count : the magnitude is the number of pulses to generate; the sign
         selects the polarity as described above

**********************************************************************/
void toggle(int count)
{
	unsigned int pulses;

	if(count>0)
	{
		// Positive: pulse high, ending low
		for(pulses=(unsigned int)count; pulses!=0u; pulses--)
		{
			LPC_GPIO4->SET |= (1UL << 1);
			LPC_GPIO4->CLR |= (1UL << 1);
		}
	}
	else
	{
		// Negative: pulse low, ending high. The magnitude is formed in
		// unsigned arithmetic so that the most negative int cannot overflow.
		for(pulses=0u-(unsigned int)count; pulses!=0u; pulses--)
		{
			LPC_GPIO4->CLR |= (1UL << 1);
			LPC_GPIO4->SET |= (1UL << 1);
		}
	}
}

/*----------------------------------------------------------------------------
  Initialize board specific IO

  For whichever board PLATFORM selects, route the pin to its GPIO function
  through scu_pinmux() and set the matching bit in the port's DIR register.
 *----------------------------------------------------------------------------*/
void vIOInit(void)
{
#if (PLATFORM == NXP_VALIDATION_BOARD)
  /* PA.4 : GPIO5_19: LD10 (LED) */
  scu_pinmux(0xA, 4, PDN_ENABLE, FUNC4);
  LPC_GPIO5->DIR = LPC_GPIO5->DIR | (1UL << 19);
#endif

#if (PLATFORM == HITEX_A2_BOARD)
  /* P8.1 : GPIO4_1 */
  scu_pinmux(0x8, 1, PDN_ENABLE, FUNC0);
  LPC_GPIO4->DIR = LPC_GPIO4->DIR | (1UL << 1);
#endif
}


