/*
 *   Copyright (c) Gejian Semiconductors 2023
 *   All rights reserved.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/**
*   @file    load_img.c
*   @brief   Main core loads the slave-core image from Flash/RAM into its destination RAM.
*
*/
/*
 * commit history
 * 2024/03/20, Zhao Lei, Release 2.0 DSP and let it run while(1)
 * 20240322, Jason, add comments for #if #else #endif, change #if for "load_img" function.
 * 20240330, Jason, add SCB_CleanDCache_by_Addr after writing DeadLoop;
 *                  configure DSP ILM/DLM to be Zero before writing DeadLoop.
 * 2024/04/17, Zhao Lei, replace some HWREG with SysCtl APIs; remove 2.2 supporting
 * 2024/04/23, Zhao Lei, replace some HWREG with SysCtl APIs; Clear up; Using IS_GS32Fxxx Macros
 * 2024/05/06, Zhao Lei, disable log output by default
 * 2024/05/31, Zhao Lei, refactor, test for 1.1/2.0
 * 2024/06/05, Zhao Lei, support 2.2, TBD
 * 2024/07/22, Zhao Lei, support 2.2, bugfix
 */

#include <stdint.h>
#include <string.h>

#include "driverlib.h"
#include "sysctl.h"
#include "load_img.h"
#include "board_cfg.h"

/*
 * Bounds of the DSP (CPU2) image, defined outside this file (presumably
 * linker symbols -- confirm against the linker script). The start address is
 * passed to load_img(); the end address is only used to log the image size.
 */
extern char DspImageStartAddr[];
extern char DspImageEndAddr[];
/* Chip revision, defined elsewhere; bringup_cpu2() branches on chipRevision >= 3. */
extern uint32_t chipRevision;

/* Entry point copied from the image header by load_img(); 0xDEADBEEF until then. */
static uint32_t entry_point = 0xDEADBEEF;
/*
 * Address programmed as the CPU2 reset vector by postsetup_slave() when CPU2
 * does not run from flash. Set by load_img() (image vector table) or by
 * load_deadloop_instruction(); 0xDEADBEEF until one of them has run.
 */
static uint32_t vect_addr = 0xDEADBEEF;

/*
 * Local logging switch. Any __DEBUG setting inherited from the build is
 * dropped and logging is forced off for this file; change the 0 below to 1 to
 * route debug()/info()/warn() to log_debug()/log_info()/log_warn().
 * With logging off the three macros expand to nothing, so their arguments
 * are never evaluated.
 * NOTE(review): identifiers containing "__" are reserved for the
 * implementation; consider renaming this switch.
 */
#ifdef __DEBUG
#undef __DEBUG
#endif

#define __DEBUG     0

#if __DEBUG == 1
#include "printf.h"
#include "log.h"
#define debug(...) log_debug(__VA_ARGS__)
#define info(...)  log_info(__VA_ARGS__)
#define warn(...)  log_warn(__VA_ARGS__)
#else
#define debug(...)
#define info(...)
#define warn(...)
#endif

/* Forward declaration; the definition near the end of this file is only compiled when defined(__riscv) && LOAD_DSP_CPU2 != 0. */
static int32_t load_img(const void *img_base);

/*
 * SLAVE_IS_RUNNING_IN_FLASH is 1 when all of the following hold, else 0:
 *   - building for RISC-V (__riscv defined),
 *   - LOAD_DSP_CPU2 != 0,
 *   - IS_GS32F3xx(0x22),
 *   - DSP_CPU2_RUN_IN_FLASH is defined and non-zero.
 *
 * The condition is evaluated once here and stored as a plain 0/1 constant.
 * Letting a macro expand to an expression containing `defined(...)` inside
 * `#if` is undefined behaviour in ISO C (and triggers -Wexpansion-to-defined).
 * The test is nested so that IS_GS32F3xx() is only evaluated when the outer
 * `defined` checks pass.
 */
#if defined(__riscv) && defined(DSP_CPU2_RUN_IN_FLASH)
#  if LOAD_DSP_CPU2 != 0 && IS_GS32F3xx(0x22) && DSP_CPU2_RUN_IN_FLASH != 0
#    define SLAVE_IS_RUNNING_IN_FLASH   1
#  endif
#endif
#ifndef SLAVE_IS_RUNNING_IN_FLASH
#define SLAVE_IS_RUNNING_IN_FLASH       0
#endif

#if IS_GS32F3xx(0x22)
/* Upper bound accepted by memcpy_using_dma(): (256+64) KiB, i.e. the ILM plus DLM lengths listed in memmap[]. */
#define TOTAL_DSP_MEM_SIZE          ((256+64)*1024)
/* Largest blockTS (transfer items per DMA block) programmed by memcpy_using_dma(); longer copies are split. */
#define MAX_AXIDMA_BLOCK_TS         8192

/* DMA controller, channel and interrupt line used by memcpy_using_dma(). */
#define MEMCPY_DMA_BASE             DMA2_BASE
#define MEMCPY_DMA_CH_BASE          DMA2_CH8_BASE
#define MEMCPY_DMA_INT              INT_DMA2_CH8

/**
 * @brief   Copy a memory region with the XDMA controller, polling for completion.
 *
 * The copy is split into blocks of at most MAX_AXIDMA_BLOCK_TS transfer
 * items. When the length, the source address and the destination address
 * are all 4-byte aligned, 4-byte transfers are used; otherwise the copy
 * falls back to single-byte transfers.
 *
 * The DMA channel interrupt is disabled at the interrupt controller and the
 * channel status is polled instead; the interrupt is not re-enabled here.
 *
 * @param   dest    destination address
 * @param   src     source address
 * @param   len     number of bytes to copy
 *
 * @return  0 on success (including len == 0), -1 if len exceeds
 *          TOTAL_DSP_MEM_SIZE or the DMA channel reports an error.
 */
static int memcpy_using_dma(void *dest, const void *src, uint32_t len)
{
    XDMA_ConfigParams dmaCfg;
    uint32_t tfrLen = 0;
    int ret = 0;

    if (len == 0) {
        return 0;
    } else if (len > TOTAL_DSP_MEM_SIZE) {                         //overflow
        return -1;
    }

    XDMA_initController(MEMCPY_DMA_BASE);

    memset(&dmaCfg, 0, sizeof(dmaCfg));

    dmaCfg.ttfc    = XDMA_TT_FC_0_M2M_DMAC;
    dmaCfg.srcBtl  = XDMA_BTL_16;
    dmaCfg.destBtl = XDMA_BTL_16;
    dmaCfg.srcAddrDirect = XDMA_ADDR_INCRE;
    dmaCfg.destAddrDirect = XDMA_ADDR_INCRE;

    /* Word transfers are only valid when length, source AND destination are all 4-byte aligned. */
    if ((len&0x03) == 0 && (((uint32_t)src)&0x03) == 0 && (((uint32_t)dest)&0x03) == 0) {
        len /= 4;                                   /* from here on len counts transfer items, not bytes */
        dmaCfg.srcTrWidthBytes = XDMA_TR_WIDTH_BYTE_4;
        dmaCfg.destTrWidthBytes = XDMA_TR_WIDTH_BYTE_4;
    } else {
        dmaCfg.srcTrWidthBytes = XDMA_TR_WIDTH_BYTE_1;
        dmaCfg.destTrWidthBytes = XDMA_TR_WIDTH_BYTE_1;
    }

    Interrupt_disable(MEMCPY_DMA_INT);

    while(tfrLen < len) {
        uint32_t sta;

        /* tfrLen counts items; the width value is used as a shift to convert items to a byte offset. */
        dmaCfg.srcAddr = (uint32_t)src + (tfrLen << dmaCfg.srcTrWidthBytes);
        dmaCfg.destAddr = (uint32_t)dest + (tfrLen << dmaCfg.destTrWidthBytes);

        if (len - tfrLen < MAX_AXIDMA_BLOCK_TS) {
            dmaCfg.blockTS = len - tfrLen;
            tfrLen = len;
        } else {
            dmaCfg.blockTS = MAX_AXIDMA_BLOCK_TS;
            tfrLen += MAX_AXIDMA_BLOCK_TS;
        }

        XDMA_stopChannel(MEMCPY_DMA_CH_BASE);
        XDMA_configChannel(MEMCPY_DMA_CH_BASE, &dmaCfg);
        XDMA_clearInterrupt(MEMCPY_DMA_CH_BASE, XDMA_INT_TFR | XDMA_INT_ERR);
        XDMA_unMaskInterrupt(MEMCPY_DMA_CH_BASE, XDMA_INT_TFR | XDMA_INT_ERR);
        XDMA_startChannel(MEMCPY_DMA_CH_BASE);

        info("DMA transfer %d*%dbytes from 0x%08X to %08X\r\n", dmaCfg.blockTS, 1<<dmaCfg.srcTrWidthBytes, dmaCfg.srcAddr, dmaCfg.destAddr);

        /* Busy-wait until the block completes or the channel flags an error. */
        do {
            sta = XDMA_getInterruptStatus(MEMCPY_DMA_CH_BASE);
        } while((sta & (XDMA_INT_TFR | XDMA_INT_ERR)) == 0 );

        if ((sta & XDMA_INT_ERR) != 0) {
            warn("DMA transfer failed status: 0x%08X\r\n", HWREG(MEMCPY_DMA_CH_BASE + 0x88));
            ret = -1;
            break;
        }
    }

    /* Common teardown: leave no pending status behind and switch the controller off, on success and on error. */
    XDMA_clearInterrupt(MEMCPY_DMA_CH_BASE, XDMA_INT_TFR | XDMA_INT_ERR);

    XDMA_disableModule(MEMCPY_DMA_BASE);

    return ret;
}
#endif

#if IS_GS32F3xx(0x22)
/*
 * CPU2 memory regions that load_img() may write image sections into.
 * load_img() matches a section against a region via base_addr/len and then
 * writes to (load_addr - base_addr + remap_addr).
 * NOTE(review): the initializers are positional and memmap_t is declared in
 * load_img.h; the order appears to be {name, len, base_addr, remap_addr} --
 * confirm against the header.
 */
const memmap_t memmap[] = {
    {"ILM",      256*1024, 0x10800000, 0x10880000},     //TODO: must using cpu2 dma?
    {"DLM",       64*1024, 0x10840000, 0x108C0000},     //TODO: must using cpu2 dma?
    {"AXI_SRAM", 128*1024, 0x10200000, 0x10200000},
};

/**
 * @brief   Request that CPU2 be held and confirm the hold took effect.
 *
 * Sets the CPU2 stop-on control and then reads back the reported state.
 *
 * @return  0 when the reported stop-on state is 1, otherwise -1.
 */
int32_t stop_slave(void)
{
    SysCtl_setDspCpu2StopOnSet();

    /* Anything other than a reported state of 1 means the hold was not applied. */
    return (SysCtl_getDspCpu2StopOnRpt() == 1) ? 0 : -1;
}

/**
 * @brief   Hook called by bringup_cpu2() after CPU2 is stopped and before the image is loaded.
 *
 * Currently does nothing; the disabled line below would fill 128 KiB at
 * 0x10200000 (the AXI_SRAM entry in memmap[]) with 0xFF.
 */
void presetup_slave(void)
{
//    memset((void *)0x10200000, 0xFF, 128*1024);
}

/*
 * Instruction word placed in the ".sram_data" section; load_deadloop_instruction()
 * points the CPU2 reset vector at it when no image is loaded.
 * NOTE(review): 0xA001 is presumably the RISC-V compressed "jump to self"
 * encoding -- confirm against the ISA manual.
 */
__attribute__((section(".sram_data"))) uint32_t dsp_deadloop_instruction = 0x0000A001;

/**
 * @brief   Select the dead-loop instruction word as the CPU2 start address.
 *
 * Only records the address of dsp_deadloop_instruction in vect_addr; the
 * reset vector itself is programmed later by postsetup_slave().
 * The disabled alternatives are kept as design notes: writing into the CPU2
 * ILM directly was observed to hardfault (see TODO), and "method 1" copies
 * the instruction into the ILM by DMA instead.
 */
void load_deadloop_instruction(void)
{
    //SysCtl_setCpu2DeadLoop(CPU2_ILM_AXI_BASE);        //TODO: hardfault, dma only

    //method 1
    //memcpy_using_dma((void *)CPU2_ILM_AXI_BASE, (const void *)&dsp_deadloop_instruction, 4);
    //vect_addr = CPU1_ILM_BASE;

    //method 2
    vect_addr = (uint32_t)&dsp_deadloop_instruction;
}

/**
 * @brief   Program the CPU2 reset vector after the image (or dead loop) is in place.
 *
 * When SLAVE_IS_RUNNING_IN_FLASH is set, the vector is the flash base plus
 * DSP_CPU2_BOOT_ADDR_FLASH_OFFSET; otherwise it is the address previously
 * stored in vect_addr.
 */
void postsetup_slave(void)
{
#if SLAVE_IS_RUNNING_IN_FLASH
    const uint32_t reset_vector = (uint32_t)(FLASH_MEMORY_BASE + DSP_CPU2_BOOT_ADDR_FLASH_OFFSET);
#else
    const uint32_t reset_vector = vect_addr;
#endif

    SysCtl_setCpu2ResetVector(reset_vector);

#if SLAVE_IS_RUNNING_IN_FLASH
    debug("cpu2 run from flash @ 0x%08X\r\n", SysCtl_getCpu2ResetVector());
#endif
}

/**
 * @brief   Clear the CPU2 stop-on control that stop_slave() set.
 */
void run_slave(void)
{
    SysCtl_setDspCpu2StopOnClr();
}
#endif

/**
 * @brief   Bring up CPU2 (DSP) from the main core; a no-op unless IS_GS32F3xx(0x22).
 *
 * Two paths, selected at run time by chipRevision:
 *  - chipRevision >= 3: optionally load the image, then trigger and clear
 *    inter-core interrupt (0, 1).
 *  - older revisions: stop CPU2, load either the dead-loop word or the image,
 *    program the reset vector and release CPU2.
 *
 * Which load step is compiled in depends on LOAD_DSP_CPU2 and
 * SLAVE_IS_RUNNING_IN_FLASH. Any failure aborts the bring-up silently apart
 * from an optional warn() log; there is no return value.
 */
void bringup_cpu2(void)
{
#if IS_GS32F3xx(0x22)   //2.2CS

    if (chipRevision >= 3) {

        /* Image is only copied when CPU2 does not execute directly from flash. */
#if LOAD_DSP_CPU2 != 0 && SLAVE_IS_RUNNING_IN_FLASH == 0
        int32_t ret = load_img(DspImageStartAddr);

        if (ret != 0) {
            warn("load cpu2 image from 0x%08X failed, ret %d.\r\n", (uint32_t)DspImageStartAddr, ret);
            return;
        }
#endif

        /* NOTE(review): presumably this inter-core interrupt is what starts CPU2 on these revisions -- confirm. */
        CIDU_TriggerInterCoreInt(0, 1);
        CIDU_ClearInterCoreIntReq(0, 1);

    } else {
        /* CPU2 must be confirmed held before its memory or reset vector is touched. */
        if (stop_slave() != 0) {
            warn("cpu2 is running, Abort.\r\n");
            return;
        }

        presetup_slave();

        /* No image configured: park CPU2 on the dead-loop word; otherwise copy the image unless it runs from flash. */
#if LOAD_DSP_CPU2 == 0
        load_deadloop_instruction();
#elif !SLAVE_IS_RUNNING_IN_FLASH
        int32_t ret = load_img(DspImageStartAddr);

        if (ret != 0) {
            warn("load cpu2 image from 0x%08X failed, ret %d.\r\n", (uint32_t)DspImageStartAddr, ret);
            return;
        }
#endif

        /* Program the reset vector, then release the hold. */
        postsetup_slave();

        run_slave();

        info("bringup cpu2 finished.\r\n");
    }
#endif
}

#if defined(SMP_CPU_CNT) && SMP_CPU_CNT > 1
/**
 * @brief   Start CPU2 for an SMP build (SMP_CPU_CNT > 1).
 *
 * Holds CPU2, points its reset vector at the flash base (FLASH_TARGET builds)
 * or at AXI SRAM0, releases the hold and checks the reported state.
 * Failures are only reported through the optional warn() log.
 */
void bringup_smp_cpu2(void)
{
    debug("cpu2 stop_on_reset: %X\r\n", HWREG(CRG_CFG_BASE + 0x454));

    /* Hold CPU2 and make sure the hold is reported before changing its reset vector. */
    SysCtl_setDspCpu2StopOnSet();
    if (SysCtl_getDspCpu2StopOnRpt() != 1) {
        warn("cpu2 is running, Abort.\r\n");
        return;
    }

#ifndef FLASH_TARGET
    SysCtl_setCpu2ResetVector(AXI_SRAM0_BASE);        /* reset vector in AXI SRAM0 */
#else
    SysCtl_setCpu2ResetVector(FLASH_MEMORY_BASE);     /* reset vector in flash */
#endif

    /* Release the hold and verify that CPU2 no longer reports as stopped. */
    SysCtl_setDspCpu2StopOnClr();

    debug("cpu2 stop_on_reset: %X\r\n", SysCtl_getDspCpu2StopOnRpt());

    if (SysCtl_getDspCpu2StopOnRpt() != 0) {
        warn("fail to release cpu2.\r\n");
        return;
    }

    info("cpu2 now is running.\r\n");
}
#endif

//-------------------------------------------------------------------------------------------------------
#if (defined(__riscv) && LOAD_DSP_CPU2 != 0)
/**
 * @brief   Copy every section of a slave-core image to its destination RAM.
 *
 * The image starts with an img_header_t; section payloads follow the header
 * back to back, in the order of the header's section table. For each section
 * the load address is looked up in memmap[] and the payload is copied to the
 * remapped address, by memcpy() or by DMA depending on LOAD_DSP_BY_DMA.
 *
 * Side effects: entry_point and vect_addr are updated from the header as
 * soon as the magic check passes.
 *
 * A section that fits no memmap[] region is skipped (debug log only) and
 * does NOT make the function fail.
 * NOTE(review): confirm that skipping unmapped sections is intended.
 *
 * @param   img_base    address of the image header
 *
 * @return  0 when the header is valid and no copy failed,
 *          -1 on NULL pointer, bad header magic or a failed DMA copy.
 */
static int32_t load_img(const void *img_base)
{
    const img_header_t *img = (const img_header_t *)img_base;
    const uint8_t *img_bytes = (const uint8_t *)img_base;
    uint32_t offset;

    if (img_base == NULL) {
        return -1;
    }

    info("load dsp image @0x%X, size: %d!\r\n", (uint32_t)(uintptr_t)img_base, (int)(DspImageEndAddr - DspImageStartAddr));

    if (img->magic != IMG_HEADER_MAGIC) {
        debug("img header magic 0x%X != 0x%X, abort!\r\n", img->magic, IMG_HEADER_MAGIC);
        return -1;
    }

    debug("----- head info ------\r\n");
    debug("length: %d\r\n", img->length);
    debug("entry_point: 0x%08X\r\n", img->entry_point);
    debug("vect_table: 0x%08X\r\n", img->vect_table);
    debug("sections: %d\r\n", img->sections);

    entry_point = img->entry_point;
    vect_addr = img->vect_table;

    /* First payload byte sits right behind the header. */
    offset = sizeof(img_header_t);

    for (uint32_t i = 0; i < img->sections && i < MAX_SECTIONS; i += 1) {
        const img_section_t *img_sect = &img->section[i];
        const uint32_t load_addr = img_sect->load_addr;
        const uint32_t sect_len = img_sect->length;
        const uint8_t *sect_src = img_bytes + offset;
        char name[13];
        int32_t loaded = 0;

        /* strncpy() does not terminate when the source fills the buffer, so terminate explicitly. */
        strncpy(name, (const char *)img_sect->name, sizeof(name) - 1);
        name[sizeof(name) - 1] = '\0';

        debug("----- load section: %s -----\r\n", name);
        debug("length: %d\r\n", img_sect->length);
        debug("load from: 0x%08X\r\n", (uint32_t)(uintptr_t)sect_src);
        debug("load to: 0x%08X\r\n", img_sect->load_addr);
        debug("crc: 0x%X\r\n", img_sect->crc);

        for (size_t j = 0; j < sizeof(memmap) / sizeof(memmap[0]); j += 1) {
            const uint32_t region_base = memmap[j].base_addr;
            const uint32_t region_len = memmap[j].len;

            /* Containment test written without additions so a malformed header cannot wrap around 32 bits. */
            if (load_addr < region_base || sect_len > region_len ||
                load_addr - region_base > region_len - sect_len) {
                continue;
            }

            /* Translate the section's load address into the address this core writes to. */
            void *dest_remap = (void *)(uintptr_t)(load_addr - region_base + memmap[j].remap_addr);

        #if LOAD_DSP_BY_DMA == 0
            memcpy(dest_remap, sect_src, sect_len);
        #else
            if (memcpy_using_dma(dest_remap, sect_src, sect_len) != 0) {
                return -1;
            }
        #endif
            loaded = 1;
            break;
        }

        if (loaded) {
            debug("load ok\r\n");
        } else {
            debug("outof RAM range, load failed!\r\n");
        }

        /* Payloads are packed back to back, so advance even when the section was skipped. */
        offset += sect_len;
    }

    return 0;
}
#endif
