STM32F4 bare-metal start-up and real bit banging speed

Last year, I made a post about the bit-banging capabilities of the STM32F1 (ARM Cortex-M3) microcontroller. Here are some comparisons with the STM32F4 (Cortex-M4).

STM32F4 bare metal start-up (no compiler libraries added)

The STM32F4 runs at 168MHz CPU clock.

Because my oscilloscope only goes up to 50MHz, I made a frequency divider with an FPGA. The Verilog code is below. Yes, I do know that when the division ratio is a power of two, an implementation with a chain of D flip-flops is much easier.

// Terminal count of the divider. The counter runs 0..TRIGGER and then wraps,
// so one output pulse is produced every (TRIGGER + 1) cycles of clk
// (five cycles with the value below).
`define TRIGGER 16'd4

// Counter-based clock divider.
//   clk          - clock to be divided
//   rst          - asynchronous reset, active low
//   clk_divn_out - high during the single clk cycle in which the counter
//                  holds the terminal count
module freqdiv(
    input clk,
    input rst,
    output clk_divn_out
    );

reg [15:0] counter;

// Terminal-count flag; it drives both the counter wrap and the output.
wire at_trigger = (counter == `TRIGGER);

always @(posedge clk or negedge rst)
begin
    if (!rst)
        counter <= 16'd0;
    else if (at_trigger)
        counter <= 16'd0;
    else
        counter <= counter + 1;
end

assign clk_divn_out = at_trigger;

endmodule

In the foreground is my self-designed STM32F4 board, with its JTAG debugger on the left, the Xilinx Spartan-6 FPGA on the right, and the Xilinx platform cable in the middle.

The simple CPU-powered GPIO loop...

/* Excerpt of the measurement loop: eight back-to-back stores to the GPIOC
 * output data register, alternating 0xFF and 0x00, repeated forever. */
{
        /* alternating all-ones / all-zeros values for the low eight port bits */
        uint8_t buffer[8] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};

        while(1) {
                        /* the stores are written out one by one instead of
                         * using an inner loop */
                        GPIOC->ODR = buffer[0];
                        GPIOC->ODR = buffer[1];
                        GPIOC->ODR = buffer[2];
                        GPIOC->ODR = buffer[3];
                        GPIOC->ODR = buffer[4];
                        GPIOC->ODR = buffer[5];
                        GPIOC->ODR = buffer[6];
                        GPIOC->ODR = buffer[7];
        }
}

...already gives good results: GPIO output is just half the frequency of the CPU core: ~84MHz. It seems that the CPU needs two cycles to update the GPIO. According to the datasheet of the chip (STM32F405), this is the maximum GPIO speed, "[...] fast I/Os up to 84 MHz" (See the first page of [1]).

This screenshot shows ~42MHz because the frequency is divided by two using the FPGA (so as not to exceed 50MHz and the maximum bandwidth of my oscilloscope).

References

[1] ST, STM32F405xx and STM32F407xx datasheet, http://www.st.com/web/en/resource/technical/document/datasheet/DM00037051.pdf

Complete bare-metal code, including plain-C start-up code. Only a linker file is needed to compile.

/* register definitions */
#include "stm32f4xx.h"

/* The Cortex-M architecture allows the start-up code to be written in plain C.
 *
 * The symbols below come from the linker file.  Only their addresses are
 * meaningful; the objects themselves are never read or written by name. */
extern unsigned int __data_flash_start_addr;    /* source of the .data copy in ResetHandler      */
extern unsigned int __data_flash_end_addr;      /* not referenced by the code in this file       */
extern unsigned int __data_sram_start_addr;     /* destination start of the .data copy           */
extern unsigned int __data_sram_end_addr;       /* destination end of the .data copy             */
extern unsigned int __bss_start_addr;           /* start of the region zeroed in ResetHandler    */
extern unsigned int __bss_end_addr;             /* end of the region zeroed in ResetHandler      */
extern unsigned int __stack_end_addr;           /* address stored in entry 0 of the vector table */

/* Prototypes of the exception handlers referenced by the vector table. */

/* reset entry point */
void ResetHandler(void);

/* fault-type exceptions */
void NmiHandler(void);
void HardFaultHandler(void);
void MemManageHandler(void);
void BusFaultHandler(void);
void UsageFaultHandler(void);

/* system service / debug exceptions */
void SvCallHandler(void);
void DebugMonitorHandler(void);

/* catch-all used for the remaining populated entries */
void EmptyHandler(void);

/* Exception vector table.
 *
 * Placed in the ".vector" section and marked "used" so the compiler emits it
 * even though no C code references it.  Entry n lives at byte offset 4*n on
 * the 32-bit target.  Indices that are not listed (7-10 and 13) are reserved
 * and are zero-initialised. */
void (* const vector[])(void) __attribute__ ((section(".vector"), used)) =
{
        [0]  = (void (*)(void))&__stack_end_addr,       /* 0x0000_0000  stack address   */
        [1]  = ResetHandler,                            /* 0x0000_0004  Reset           */
        [2]  = NmiHandler,                              /* 0x0000_0008  NMI             */
        [3]  = HardFaultHandler,                        /* 0x0000_000C  HardFault       */
        [4]  = MemManageHandler,                        /* 0x0000_0010  MemManage       */
        [5]  = BusFaultHandler,                         /* 0x0000_0014  BusFault        */
        [6]  = UsageFaultHandler,                       /* 0x0000_0018  UsageFault      */
        [11] = SvCallHandler,                           /* 0x0000_002C  SVcall          */
        [12] = DebugMonitorHandler,                     /* 0x0000_0030  Debug Monitor   */
        [14] = EmptyHandler,                            /* 0x0000_0038  PendSV          */
        [15] = EmptyHandler,                            /* 0x0000_003C  SysTick         */
};

/* Zero-initialised storage placed in the ".stack" section.
 * NOTE(review): presumably the linker file puts __stack_end_addr at the end
 * of this section -- confirm against the linker file. */
enum { STACK_SIZE_BYTES = 4096 };
char stack[STACK_SIZE_BYTES] __attribute__ ((section (".stack"))) = { 0 };

/* http://www.danielvik.com/2010/02/fast-memcpy-in-c.html */
/* Byte-wise copy of `length` bytes from `src` to `dest`.
 *
 * dest   - start of the destination region
 * src    - start of the source region (never written through)
 * length - number of BYTES to copy; 0 copies nothing
 *
 * Bytes are copied in ascending address order.
 *
 * Declared `static inline`: a plain `inline` definition does not provide an
 * external definition in C99 and later, so a call that is not inlined would
 * not resolve to this function.
 *
 * NOTE(review): name and signature differ from the standard C memcpy
 * (returns void, takes a uint32_t length); compilers may warn about the
 * mismatch -- consider renaming. */
static inline void memcpy(void* dest, const void* src, uint32_t length) {
        unsigned char* dst8 = (unsigned char*)dest;
        const unsigned char* src8 = (const unsigned char*)src;

        while (length--) {
                *dst8++ = *src8++;
        }
}

/* Fill `length` bytes starting at `dest` with the byte value `pattern`.
 *
 * dest    - start of the region to fill
 * pattern - byte value written to every position
 * length  - number of BYTES to write; 0 writes nothing
 *
 * Declared `static inline`: a plain `inline` definition does not provide an
 * external definition in C99 and later, so a call that is not inlined would
 * fail to link.  The destination is accessed as unsigned bytes so that
 * patterns above 0x7F are stored without a signed conversion. */
static inline void mempat(void* dest, uint8_t pattern, uint32_t length) {
        unsigned char* dst8 = (unsigned char*)dest;

        while (length--) {
                *dst8++ = pattern;
        }
}

/* Catch-all handler (installed for PendSV and SysTick in the vector table):
 * spins forever and never returns. */
__attribute__ ((noreturn)) void EmptyHandler(void) {
        while (1) {
                /* spin */
        }
}

/* NMI handler: spins forever and never returns. */
__attribute__ ((noreturn)) void NmiHandler(void) {
        while (1) {
                /* spin */
        }
}

/* HardFault handler: spins forever and never returns. */
__attribute__ ((noreturn)) void HardFaultHandler(void) {
        while (1) {
                /* spin */
        }
}

/* MemManage handler: spins forever and never returns. */
__attribute__ ((noreturn)) void MemManageHandler(void) {
        while (1) {
                /* spin */
        }
}

/* BusFault handler: spins forever and never returns. */
__attribute__ ((noreturn)) void BusFaultHandler(void) {
        while (1) {
                /* spin */
        }
}

/* UsageFault handler: spins forever and never returns. */
__attribute__ ((noreturn)) void UsageFaultHandler(void) {
        while (1) {
                /* spin */
        }
}

/* SVCall handler: spins forever and never returns. */
__attribute__ ((noreturn)) void SvCallHandler(void) {
        while (1) {
                /* spin */
        }
}

/* Debug Monitor handler: spins forever and never returns. */
__attribute__ ((noreturn)) void DebugMonitorHandler(void) {
        while (1) {
                /* spin */
        }
}

int main(void);

/* Reset entry point (entry 1 of the vector table).
 *
 * Prepares the C run-time environment and then calls main():
 *   1. copies the initial values of .data from flash to SRAM,
 *   2. clears .bss to zero.
 * Never returns; if main() ever returns, the CPU is parked in a loop. */
__attribute__ ((noreturn)) void ResetHandler(void) {
        /* Section sizes in BYTES.  The linker symbols are declared as
         * unsigned int, so subtracting their addresses as pointers would
         * yield a count of 4-byte words, while memcpy()/mempat() expect a
         * byte count; that would initialise only a quarter of each section.
         * Subtracting the integer addresses gives the byte count directly. */
        const uint32_t data_bytes =
                (uint32_t)((uintptr_t)&__data_sram_end_addr - (uintptr_t)&__data_sram_start_addr);
        const uint32_t bss_bytes =
                (uint32_t)((uintptr_t)&__bss_end_addr - (uintptr_t)&__bss_start_addr);

        /* Copy .data to SRAM */
        memcpy(&__data_sram_start_addr, &__data_flash_start_addr, data_bytes);
        /* Set .bss to zero */
        mempat(&__bss_start_addr, 0x00, bss_bytes);

        /* jump to main */
        main();

        /* should never return from main */
        for(;;) {}
}

int main(void) {
        /* #1 configuration
         * CPU now running at 16MHz (HSI) */

        /* flash settings (see RM0090 rev9, p80)*/
        /* Enable or disable the Prefetch Buffer */
        FLASH->ACR =
                    FLASH_ACR_LATENCY_5WS       /* 6 CPU cycle wait */
                  | FLASH_ACR_PRFTEN                    /* enable prefetch */
                  | FLASH_ACR_ICEN                      /* instruction cache enable */
                  | FLASH_ACR_DCEN;                     /* data cache enable */

        /* Configure clocks
         * Max SYSCLK: 168MHz
         * Max AHB:  SYSCLK
         * Max APB1: SYSCLK/4 = 48MHz
         * Max APB2: SYSCLK/2 = 86MHz
         * + enable sys clock output 2 with clock divider = 4 */
        RCC->CFGR =
                          0x0                           /* Clock output 2 is SYSCLK (RCC_CFGR_MCO2) */
                        | ( 0x6 << 27)                  /* Clock output divider */
                        | RCC_CFGR_PPRE2_DIV2           /* APB2 prescaler */
                        | RCC_CFGR_PPRE1_DIV4;          /* APB2 prescaler */


        {
                /* Clock control register */
                RCC->CR = RCC_CR_HSEON;         /* Enable external oscillator */

                /* Wait for locked external oscillator */
                while((RCC->CR & RCC_CR_HSERDY) != RCC_CR_HSERDY);

                /* PLL config */
                RCC->PLLCFGR =
                          RCC_PLLCFGR_PLLSRC_HSE                /* PLL source */
                        | (4 << 0)                                              /* PLL input division */
                        | (168 << 6)                                    /* PLL multiplication */
                        | (0 << 16)                                     /* PLL sys clock division */
                        | (7 << 24);                                    /* PLL usb clock division =48MHz */

                /* crystal:  8MHz
                 * PLL in:   2MHz (div 4)
                 * PLL loop: 336MHz (mul 168)
                 * PLL out:  168MHz (div 2)
                 * PLL usb:  48MHz (div 7)
                 */

                /* Enable PLL */
                RCC->CR |=      RCC_CR_PLLON;


                /* Wait for locked PLL */
                while((RCC->CR & RCC_CR_PLLRDY) != RCC_CR_PLLRDY);

                /* select system clock */
                RCC->CFGR &= ~RCC_CFGR_SW; /* clear */
                RCC->CFGR |= RCC_CFGR_SW_PLL;   /* SYSCLK is PLL */

                /* Wait for SYSCLK to be PPL */
                while((RCC->CFGR & RCC_CFGR_SW_PLL) != RCC_CFGR_SW_PLL);
        }

        /* GPIO is in AHB1 peripherals */
        RCC->AHB1ENR =
                  RCC_AHB1ENR_GPIOAEN
                | RCC_AHB1ENR_GPIOBEN
                | RCC_AHB1ENR_GPIOCEN;

        /* PC9 = MCO2 = alternate func */
        GPIOC->MODER   = 0x00080001; /* output */
        GPIOC->OTYPER  = 0x00000000; /* push-pull */
        GPIOC->OSPEEDR = 0x000C0003; /* max speed */

        {
                uint8_t buffer[8] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};

                while(1) {
                                GPIOC->ODR = buffer[0];
                                GPIOC->ODR = buffer[1];
                                GPIOC->ODR = buffer[2];
                                GPIOC->ODR = buffer[3];
                                GPIOC->ODR = buffer[4];
                                GPIOC->ODR = buffer[5];
                                GPIOC->ODR = buffer[6];
                                GPIOC->ODR = buffer[7];
                }
        }
}