
#include "main.h"
// Prototypes of functions whose definitions are not in this part of the file.
void SystemClock_Config(void);
static void MX_GPIO_Init(void);
static void MX_DMA_Init(void);
static void MX_TIM1_Init(void);

#define NUM_BUFFERS_TEST (8*1024) // buffers per test run: 8*1024 buffers of 4 kB each (32 MB)
uint32_t test_bursts = NUM_BUFFERS_TEST; // number of 4 kB buffers still to send in the current test run
volatile uint8_t transfer_status=0; // non-zero while a buffer transfer is in progress (cleared by transfer_stop())
// Number of bytes of the current buffer that have not yet been assigned to a timer burst.
// Written by transfer_start() and by the TIM1 update interrupt handler, so it must be
// volatile: the compiler may not cache it or move its accesses across the register
// writes that start the timer.
volatile uint32_t datacounter;
uint8_t buf[4096]; // data buffer that is streamed to the FTDI
void init_system(void); // initialise the transfer system (GPIO, TIM, DMA etc.)
void init_buf(void); // fill the buffer with the test pattern (0,1,2...)
char transfer_start(void); // start the transfer of one buffer; returns 0 on success, 1 if there is nothing to send
void transfer_stop(void); // stop the transfer

// Busy-wait until transfer_status drops back to zero, i.e. until the running
// buffer transfer has been stopped.
static void wait_transfer_end(void){
  while(transfer_status){
    asm("nop"); // handy spot for a breakpoint while debugging
  }
}

// Program entry point: bring up clocks and the debug pin, prepare the test
// buffer and the transfer hardware, wait for the PC side, then send
// NUM_BUFFERS_TEST buffers over and over again.
int main(void){

  HAL_Init();
  SystemClock_Config(); // Core 180MHz (90/45MHz APB buses, 180MHz timers)

  // PB7 is a debug output, driven high here
  LL_AHB1_GRP1_EnableClock(LL_AHB1_GRP1_PERIPH_GPIOB);
  LL_GPIO_SetPinMode(GPIOB, LL_GPIO_PIN_7, LL_GPIO_MODE_OUTPUT);
  LL_GPIO_SetPinSpeed(GPIOB, LL_GPIO_PIN_7, LL_GPIO_SPEED_FREQ_HIGH);
  LL_GPIO_SetOutputPin(GPIOB, LL_GPIO_PIN_7);

  init_buf();
  init_system();

  // PA9 (timer PWM output showing when DMA requests are generated) at maximum speed, for debugging
  LL_GPIO_SetPinSpeed(GPIOA, LL_GPIO_PIN_9, LL_GPIO_SPEED_FREQ_HIGH);

  // Block while the RXF line (PA1) reads high. Once it reads low the PC
  // application has sent its dummy byte and is ready to receive data.
  while(LL_GPIO_IsInputPinSet(GPIOA, LL_GPIO_PIN_1)){
  }

  for(;;){
    // send the configured number of 4kB buffers, one after another
    for(; test_bursts; test_bursts--){
      transfer_start();    // kick off the next buffer
      wait_transfer_end(); // and wait for it to finish
    }
    // every buffer of this run has been sent: make sure the last one is done,
    // then start another run of NUM_BUFFERS_TEST buffers
    wait_transfer_end();
    test_bursts = NUM_BUFFERS_TEST;
  }
}


// Fill buf with the test pattern: every byte holds its own index, truncated
// to 8 bits (0,1,2,...,255,0,1,...).
void init_buf(void){
	uint32_t pos = 0;

	while(pos < sizeof(buf)/sizeof(buf[0])){
		buf[pos] = (uint8_t)pos; // only the low 8 bits of the index are stored
		pos++;
	}
}

/*
 * External signal (TXE line from FTDI) gates TIM1 (low level enables counting)
 * TIM1 runs in one-pulse mode with repetition counter 256
 * TIM1 internally generates DMA requests from CH2 (as soon as possible after period start, therefore CCR2=1)
 * DMA transfers (parallel) data from memory buffer to GPIO 
 * TIM1 generates PWM on Channel 1 (WR line to FTDI). The falling edge signals the data write command to the FTDI. The moment of the falling edge is selected so that the DMA has time to complete writing data to the GPIO (+ setup time elapsed).
 * If the external signal (TXE) goes high (FTDI signals that it is not ready to receive data), the timer is paused (gated off). When TXE goes low again, the timer resumes counting (and transmitting).
 * Once the timer completes the predetermined number of periods (repetition counter), it stops and triggers an interrupt.
 * In the interrupt routine, the program determines how much data needs to be sent and sets the repetition counter accordingly and starts the next transfer (enable timer).
 *
 * Since the timer always sends a predetermined and selectable amount of data (period, DMA requests), it is possible to reasonably check whether all the necessary data has been sent.
 * For example, when sending 520 bytes of data, 520 is set in the DMA; the first burst is 256 timer periods long, the second also 256, and the third 8.
 *
 * The transfer takes place in "bursts" whose length is determined by the repetition counter of the timer. Typically, the largest possible burst is selected if there is a lot of data. However, a burst is limited to 256 periods on older chips, while newer chips allow for higher values.
 * 
 * GPIO connections:
 * STM32			FTDI
 * PC0..7 	-> 		D0..7
 * PA12		<-		C1 (!TXE flag)
 * PA8		->		C3 (!WR cmd)
 *
 * PA1		<-		C0 (!RXF flag) - set to input with pullup
 * PA0		->		C2 (!RD cmd) - driven high (inactive; the FTDI is never read)
 *
 * PA9 - auxiliary timer output (DMA requests time indicator) - for debug
 * PB7 - auxiliary TIM IRQ routine indicator - for debug
 */
 

// Configure everything the transfer needs: peripheral clocks, the GPIO pins
// towards the FTDI, DMA2 stream 2 (memory -> GPIOC->ODR) and TIM1 (time base,
// ETR gating, CH1/CH2 outputs and the update interrupt).
// Neither the DMA stream nor the timer is started here; transfer_start() does that.
void init_system(void){
LL_TIM_InitTypeDef TIM_InitStruct = {0};
LL_TIM_OC_InitTypeDef TIM_OC_InitStruct = {0};
LL_GPIO_InitTypeDef GPIO_InitStruct = {0};
LL_DMA_InitTypeDef dma; // not zero-initialised; its fields are assigned one by one in the DMA section below

// enable the peripheral clocks (TIM1, DMA2, GPIOA/C/D) and select the timer clock prescaler
LL_APB2_GRP1_EnableClock(LL_APB2_GRP1_PERIPH_TIM1);
LL_AHB1_GRP1_EnableClock(LL_AHB1_GRP1_PERIPH_DMA2 | LL_AHB1_GRP1_PERIPH_GPIOD | LL_AHB1_GRP1_PERIPH_GPIOA | LL_AHB1_GRP1_PERIPH_GPIOC);
LL_RCC_SetTIMPrescaler(LL_RCC_TIM_PRESCALER_FOUR_TIMES);

// handling of the FTDI signals that are not used for the transfer itself (RD input and RXF output of the FTDI)
// Setting GPIOs for RD and RXF
// RD=1 (we do not want to read from the FTDI); the output latch is set before the pin is switched to output mode
LL_GPIO_SetOutputPin(GPIOA, LL_GPIO_PIN_0);
GPIO_InitStruct.Pin = LL_GPIO_PIN_0;
GPIO_InitStruct.Mode = LL_GPIO_MODE_OUTPUT;
GPIO_InitStruct.Speed = LL_GPIO_SPEED_FREQ_LOW;
GPIO_InitStruct.OutputType = LL_GPIO_OUTPUT_PUSHPULL;
GPIO_InitStruct.Pull = LL_GPIO_PULL_NO;
LL_GPIO_Init(GPIOA, &GPIO_InitStruct);
// RXF as input with pull-up (main() polls the RXF flag to check that the PC application is running)
GPIO_InitStruct.Pin = LL_GPIO_PIN_1;
GPIO_InitStruct.Mode = LL_GPIO_MODE_INPUT;
GPIO_InitStruct.Pull = LL_GPIO_PULL_UP;
LL_GPIO_Init(GPIOA, &GPIO_InitStruct);

// PC0..7 ------> DMA Out (data bus to FTDI), plain push-pull outputs written through GPIOC->ODR
GPIO_InitStruct.Pin = LL_GPIO_PIN_0|LL_GPIO_PIN_1|LL_GPIO_PIN_2|LL_GPIO_PIN_3|LL_GPIO_PIN_4|LL_GPIO_PIN_5|LL_GPIO_PIN_6|LL_GPIO_PIN_7;
GPIO_InitStruct.Mode = LL_GPIO_MODE_OUTPUT;
GPIO_InitStruct.Speed = LL_GPIO_SPEED_FREQ_LOW;
GPIO_InitStruct.OutputType = LL_GPIO_OUTPUT_PUSHPULL;
GPIO_InitStruct.Pull = LL_GPIO_PULL_NO;
LL_GPIO_Init(GPIOC, &GPIO_InitStruct);

// Timer pins, all on alternate function 1:
// PA12   ------> TIM1_ETR (TXE signal from FTDI)
// PA8   ------> TIM1_CH1 (WR signal to FTDI)
// PA9   ------> TIM1_CH2 (debug signal to see DMA request timing)
GPIO_InitStruct.Pin = LL_GPIO_PIN_8 | LL_GPIO_PIN_9 | LL_GPIO_PIN_12;
GPIO_InitStruct.Mode = LL_GPIO_MODE_ALTERNATE;
GPIO_InitStruct.Speed = LL_GPIO_SPEED_FREQ_HIGH;
GPIO_InitStruct.OutputType = LL_GPIO_OUTPUT_PUSHPULL;
GPIO_InitStruct.Pull = LL_GPIO_PULL_NO;
GPIO_InitStruct.Alternate = LL_GPIO_AF_1;
LL_GPIO_Init(GPIOA, &GPIO_InitStruct);

// PA12 (TXE input) is initialised a second time, now with the pull-up enabled
GPIO_InitStruct.Pin = LL_GPIO_PIN_12;
GPIO_InitStruct.Pull = LL_GPIO_PULL_UP;
LL_GPIO_Init(GPIOA, &GPIO_InitStruct);

// DMA setting: DMA2 stream 2, channel 6, byte-wide memory -> peripheral transfers
dma.PeriphOrM2MSrcAddress  = (uint32_t)&(GPIOC->ODR); // destination: output data register of GPIOC (byte-wide writes)
dma.MemoryOrM2MDstAddress  = (uint32_t)buf; // set again by transfer_start() before every transfer
dma.Direction              = LL_DMA_DIRECTION_MEMORY_TO_PERIPH;
dma.Mode                   = LL_DMA_MODE_NORMAL; // single "run"
dma.PeriphOrM2MSrcIncMode  = LL_DMA_PERIPH_NOINCREMENT;
dma.MemoryOrM2MDstIncMode  = LL_DMA_MEMORY_INCREMENT;
dma.PeriphOrM2MSrcDataSize = LL_DMA_PDATAALIGN_BYTE;
dma.MemoryOrM2MDstDataSize = LL_DMA_MDATAALIGN_BYTE;
dma.NbData                 = sizeof(buf)/sizeof(buf[0]); // set again by transfer_start() before every transfer
dma.Channel                = LL_DMA_CHANNEL_6;
dma.Priority               = LL_DMA_PRIORITY_LOW; // should be higher, but for now this is the only stream/channel in use
dma.FIFOMode               = LL_DMA_FIFOMODE_DISABLE; // probably should be changed to help offload the SRAM bus (!)
dma.FIFOThreshold          = LL_DMA_FIFOTHRESHOLD_1_4;
dma.MemBurst               = LL_DMA_MBURST_SINGLE;
dma.PeriphBurst            = LL_DMA_PBURST_SINGLE;
LL_DMA_Init(DMA2, LL_DMA_STREAM_2, &dma);

// TIM1 base: one timer period corresponds to one byte written to the FTDI
TIM_InitStruct.Prescaler = 0; // time unit 1/180M = 5.55ns
TIM_InitStruct.CounterMode = LL_TIM_COUNTERMODE_UP;
TIM_InitStruct.Autoreload = 15; // 16*5.55 = 88ns
TIM_InitStruct.ClockDivision = LL_TIM_CLOCKDIVISION_DIV1;
TIM_InitStruct.RepetitionCounter = 16; // placeholder; transfer_start() and the update ISR program the real burst length
LL_TIM_Init(TIM1, &TIM_InitStruct);
LL_TIM_SetOnePulseMode(TIM1, LL_TIM_ONEPULSEMODE_SINGLE); // one pulse mode - run number of "repetition counter" periods

// Tim1 input (gating) - TXE signal from FTDI
LL_TIM_SetTriggerInput(TIM1, LL_TIM_TS_ETRF);
LL_TIM_SetSlaveMode(TIM1, LL_TIM_SLAVEMODE_GATED); // gate mode - count only if TXE asserted
LL_TIM_ConfigETR(TIM1, LL_TIM_ETR_POLARITY_INVERTED, LL_TIM_ETR_PRESCALER_DIV1, LL_TIM_ETR_FILTER_FDIV1); // inverted polarity, no prescaler, no filter: timer counts while TXE=0

// output to pin - WR signal to FTDI (TIM1 channel 1)
LL_TIM_OC_StructInit(&TIM_OC_InitStruct);
TIM_OC_InitStruct.OCMode = LL_TIM_OCMODE_PWM1; // PWM mode, rising when counter starts, falling when "compare" event
TIM_OC_InitStruct.OCState = LL_TIM_OCSTATE_ENABLE;
TIM_OC_InitStruct.OCNState = LL_TIM_OCSTATE_DISABLE;
TIM_OC_InitStruct.CompareValue = 9; // WR falls at the compare event (original author's estimate: 56ns after timer start, low pulse 88-55 = 33ns)
TIM_OC_InitStruct.OCPolarity = LL_TIM_OCPOLARITY_HIGH;
LL_TIM_OC_Init(TIM1, LL_TIM_CHANNEL_CH1, &TIM_OC_InitStruct);

// DMA request generator - internal signal for DMA (also routed to PA9 for debug purposes)
// Channel 2 reuses the channel 1 settings; only the compare value differs.
TIM_OC_InitStruct.CompareValue = 1; // generate DMA request 5ns after (as soon as possible) timer start -> sending data to PORT
LL_TIM_OC_Init(TIM1, LL_TIM_CHANNEL_CH2, &TIM_OC_InitStruct);

// enable timer outputs
LL_TIM_EnableAllOutputs(TIM1);

// enable the timer update IRQ (fires at the end of each burst; the handler prepares the next burst)
WRITE_REG(TIM1->SR, ~(TIM_SR_CC1IF | TIM_SR_CC2IF | TIM_SR_UIF)); // clear the CC1, CC2 and update flags
NVIC_SetPriority(TIM1_UP_TIM10_IRQn, NVIC_EncodePriority(NVIC_GetPriorityGrouping(),0, 0));
NVIC_EnableIRQ(TIM1_UP_TIM10_IRQn);
LL_TIM_EnableIT_UPDATE(TIM1); // enable IRQ of update (when timer stops... we are using one-pulse mode)
}

// TIM1 update interrupt handler - runs each time a burst has finished.
// Looks at datacounter (bytes of the current buffer not yet assigned to a burst):
//   - 0 left      -> the buffer is complete, the transfer is stopped
//   - >= 256 left -> another full burst of 256 timer periods is started
//   - 1..255 left -> a final, shorter burst is started
// A repetition counter value of N yields N+1 timer periods (255 <-> 256 periods),
// so a burst of k bytes is programmed as k-1.
// PB7 is held high for the duration of the handler (debug/monitoring signal).
//
// NOTE(review): according to the STM32 reference manual the repetition counter
// register is preloaded, i.e. a value written here may only become active at the
// next update event. For buffers that are a multiple of 256 bytes (the only case
// used at the moment) this makes no difference - confirm on hardware before
// relying on partial bursts.
void TIM1_UP_TIM10_IRQHandler(void){
	LL_TIM_ClearFlag_UPDATE(TIM1);
	LL_GPIO_SetOutputPin(GPIOB, LL_GPIO_PIN_7); // debug indication: handler entered

	uint32_t remaining = datacounter; // bytes still waiting for a burst

	if(remaining == 0){
		// nothing left to send -> the transfer is over
		transfer_stop();
	}
	else if(remaining >= 256){
		// full-size burst: 255 + 1 = 256 periods
		LL_TIM_SetRepetitionCounter(TIM1, 255);
		LL_TIM_EnableCounter(TIM1); // start the next burst
		datacounter = remaining - 256;
	}
	else{
		// last, partial burst of `remaining` periods
		LL_TIM_SetRepetitionCounter(TIM1, remaining - 1);
		LL_TIM_EnableCounter(TIM1); // start the next burst
		datacounter = 0; // these are the last bytes of the buffer
	}
	LL_GPIO_ResetOutputPin(GPIOB, LL_GPIO_PIN_7); // debug indication: handler left
}


// Start the transfer of the data buffer `buf` (currently a fixed 4096 bytes).
// Programs the first burst length, points the DMA stream at the buffer, resets
// the timer, sets transfer_status to 1 and finally enables the DMA request,
// the DMA stream and the timer. The remaining bursts are started from the
// TIM1 update interrupt handler; transfer_status is cleared by transfer_stop().
//
// A repetition counter value of N yields N+1 timer periods (255 <-> 256 periods),
// so a burst of k bytes is programmed as k-1.
//
// Returns 0 when the transfer was started, 1 when there is nothing to send.
char transfer_start(void){
	uint16_t transfer_size=4096; // total transfer size (fixed for now, could become a function parameter)
	datacounter = transfer_size; // bytes not yet assigned to a burst

	// choose the length of the first burst
	if(datacounter == 0){ // that should not happen (who wants to transfer 0 bytes?)
		return 1;
	}
	if(datacounter >= 256){ // first burst is "full size": 255 + 1 = 256 periods
		LL_TIM_SetRepetitionCounter(TIM1, 255);
		datacounter = datacounter - 256; // bytes left for the following bursts
	}else{ // first burst is also the last one ("partial size")
		LL_TIM_SetRepetitionCounter(TIM1, datacounter - 1);
		datacounter = 0;
	}

	// set DMA
	LL_DMA_SetMemoryAddress(DMA2, LL_DMA_STREAM_2, (uint32_t)buf); // buffer address (fixed for now, could become a function parameter)
	LL_DMA_SetDataLength(DMA2, LL_DMA_STREAM_2, transfer_size); // total number of bytes to transfer

	// For safety, reset the counter and force an update event so that the timer
	// registers (including the repetition counter) are reloaded. The update
	// interrupt is masked meanwhile and the flags are cleared afterwards, so the
	// forced event does not trigger the interrupt handler.
	LL_TIM_SetCounter(TIM1, 0);
	LL_TIM_DisableIT_UPDATE(TIM1);
	LL_TIM_GenerateEvent_UPDATE(TIM1);
	WRITE_REG(TIM1->SR, ~(TIM_SR_CC1IF | TIM_SR_CC2IF | TIM_SR_UIF));
	LL_TIM_EnableIT_UPDATE(TIM1);

	// running transfer indication
	transfer_status=1;

	// enable transfer
	// NOTE(review): the original author remarked that all flags of the stream
	// must be cleared before it is enabled, but only the half-transfer and
	// transfer-complete flags of stream 2 are cleared here - consider clearing
	// the remaining error flags as well.
	LL_TIM_EnableDMAReq_CC2(TIM1);
	LL_DMA_ClearFlag_HT2(DMA2);
	LL_DMA_ClearFlag_TC2(DMA2);
	LL_DMA_EnableStream(DMA2, LL_DMA_STREAM_2);
	LL_TIM_EnableCounter(TIM1); // from now on the transfer is running
	return 0;
}

// Stop the transfer; called from the TIM1 update interrupt handler once all
// bytes of the buffer are out. Shuts down the DMA stream, the timer's DMA
// request and the timer itself, then clears transfer_status so that the
// waiting loop in main() can continue.
void transfer_stop(void){
	LL_DMA_DisableStream(DMA2, LL_DMA_STREAM_2); // DMA2 stream 2 off
	LL_TIM_DisableDMAReq_CC2(TIM1); // TIM1 channel 2 no longer raises DMA requests
	LL_TIM_DisableCounter(TIM1); // halt the timer
	transfer_status=0; // signal "no transfer in progress"
}

