I have a simple blinking led program running on STM32f103C8 (without initialization boilerplate):
void soft_delay(void) {
for (volatile uint32_t i=0; i<2000000; ++i) { }
}
uint32_t iters = 0;
while (1)
{
LL_GPIO_TogglePin(LED_GPIO_Port, LED_Pin);
soft_delay();
++iters;
}
It was compiled with both Keil uVision v.5 (default compiler) and CLion using arm-none-eabi-gcc compiler.
The surprise is that arm-none-eabi-gcc program runs 50% slower in Release mode (-O2 -flto) and 100% slower in Debug mode.
I suspect 3 reasons:
Keil over-optimization (unlikely, because the code is very simple)
arm-none-eabi-gcc under-optimization due to wrong compiler flags (I use CLion Embedded plugins` CMakeLists.txt)
A bug in the initialization so that chip has lower clock frequency with arm-none-eabi-gcc (to be investigated)
I have not yet dived into the jungles of optimization and disassembling, I hope that there are many experienced embedded developers who already encountered this issue and have the answer.
UPDATE 1
Playing around with different optimization levels of Keil ArmCC, I see
how it affects the generated code. And it affects drastically, especially execution time. Here are the benchmarks and disassembly of soft_delay() function for each optimization level (RAM and Flash amounts include initialization code).
-O0: RAM: 1032, Flash: 1444, Execution Time (20 iterations): 18.7 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
B |L6.14|
|L6.8|
LDR r0,[sp,#0]
ADDS r0,r0,#1
STR r0,[sp,#0]
|L6.14|
LDR r1,|L6.24|
LDR r0,[sp,#0]
CMP r0,r1
BCC |L6.8|
POP {r3,pc}
ENDP
-O1: RAM: 1032, Flash: 1216, Execution Time (20 iterations): 13.3 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L6.24|
B |L6.16|
|L6.10|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
|L6.16|
LDR r1,[sp,#0]
CMP r1,r0
BCC |L6.10|
POP {r3,pc}
ENDP
-O2 -Otime: RAM: 1032, Flash: 1136, Execution Time (20 iterations): 9.8 sec
soft_delay PROC
SUB sp,sp,#4
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L4.24|
|L4.8|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC |L4.8|
ADD sp,sp,#4
BX lr
ENDP
-O3: RAM: 1032, Flash: 1176, Execution Time (20 iterations): 9.9 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L5.20|
|L5.8|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC |L5.8|
POP {r3,pc}
ENDP
TODO: benchmarking and disassembly for arm-none-eabi-gcc.
This second answer is a demonstration of the kinds of things that would affect the performance results the OP may be seeing and examples of to possibly test for those STM32F103C8 blue pill.
Complete source code:
flash.ld
MEMORY
{
rom : ORIGIN = 0x08000000, LENGTH = 0x1000
ram : ORIGIN = 0x20000000, LENGTH = 0x1000
}
SECTIONS
{
.text : { *(.text*) } > rom
.rodata : { *(.rodata*) } > rom
.bss : { *(.bss*) } > ram
}
flash.s
.cpu cortex-m0
.thumb
.thumb_func
.global _start
_start:
stacktop: .word 0x20001000
.word reset
.word hang
.word hang
.thumb_func
reset:
bl notmain
b hang
.thumb_func
hang: b .
.align
.thumb_func
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.thumb_func
.globl GET32
GET32:
ldr r0,[r0]
bx lr
.thumb_func
.globl dummy
dummy:
bx lr
test.s
.cpu cortex-m0
.thumb
.word 0,0,0
.word 0,0,0,0
.thumb_func
.globl TEST
TEST:
bx lr
notmain.c
//PA9 TX
//PA10 RX
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
void dummy ( unsigned int );
#define USART1_BASE 0x40013800
#define USART1_SR (USART1_BASE+0x00)
#define USART1_DR (USART1_BASE+0x04)
#define USART1_BRR (USART1_BASE+0x08)
#define USART1_CR1 (USART1_BASE+0x0C)
#define USART1_CR2 (USART1_BASE+0x10)
#define USART1_CR3 (USART1_BASE+0x14)
//#define USART1_GTPR (USART1_BASE+0x18)
#define GPIOA_BASE 0x40010800
#define GPIOA_CRH (GPIOA_BASE+0x04)
#define RCC_BASE 0x40021000
#define RCC_APB2ENR (RCC_BASE+0x18)
#define STK_CSR 0xE000E010
#define STK_RVR 0xE000E014
#define STK_CVR 0xE000E018
#define STK_MASK 0x00FFFFFF
static void uart_init ( void )
{
//assuming 8MHz clock, 115200 8N1
unsigned int ra;
ra=GET32(RCC_APB2ENR);
ra|=1<<2; //GPIOA
ra|=1<<14; //USART1
PUT32(RCC_APB2ENR,ra);
//pa9 TX alternate function output push-pull
//pa10 RX configure as input floating
ra=GET32(GPIOA_CRH);
ra&=~(0xFF0);
ra|=0x490;
PUT32(GPIOA_CRH,ra);
PUT32(USART1_CR1,0x2000);
PUT32(USART1_CR2,0x0000);
PUT32(USART1_CR3,0x0000);
//8000000/16 = 500000
//500000/115200 = 4.34
//4 and 5/16 = 4.3125
//4.3125 * 16 * 115200 = 7948800
PUT32(USART1_BRR,0x0045);
PUT32(USART1_CR1,0x200C);
}
static void uart_putc ( unsigned int c )
{
while(1)
{
if(GET32(USART1_SR)&0x80) break;
}
PUT32(USART1_DR,c);
}
static void hexstrings ( unsigned int d )
{
//unsigned int ra;
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_putc(rc);
if(rb==0) break;
}
uart_putc(0x20);
}
static void hexstring ( unsigned int d )
{
hexstrings(d);
uart_putc(0x0D);
uart_putc(0x0A);
}
void soft_delay(void) {
for (volatile unsigned int i=0; i<2000000; ++i) { }
}
int notmain ( void )
{
PUT32(STK_CSR,4);
PUT32(STK_RVR,0x00FFFFFF);
PUT32(STK_CVR,0x00000000);
PUT32(STK_CSR,5);
uart_init();
hexstring(0x12345678);
hexstring(GET32(0xE000E018));
hexstring(GET32(0xE000E018));
return(0);
}
build
arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 flash.s -o flash.o
arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 test.s -o test.o
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m0 -march=armv6-m -c notmain.c -o notmain.thumb.o
arm-none-eabi-ld -o notmain.thumb.elf -T flash.ld flash.o test.o notmain.thumb.o
arm-none-eabi-objdump -D notmain.thumb.elf > notmain.thumb.list
arm-none-eabi-objcopy notmain.thumb.elf notmain.thumb.bin -O binary
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m3 -march=armv7-m -c notmain.c -o notmain.thumb2.o
arm-none-eabi-ld -o notmain.thumb2.elf -T flash.ld flash.o test.o notmain.thumb2.o
arm-none-eabi-objdump -D notmain.thumb2.elf > notmain.thumb2.list
arm-none-eabi-objcopy notmain.thumb2.elf notmain.thumb2.bin -O binary
uart output as shown
12345678
00FFE445
00FFC698
If I take your code, make it shorter, don't have all day.
void soft_delay(void) {
for (volatile unsigned int i=0; i<0x2000; ++i) { }
}
arm-none-eabi-gcc -c -O0 -mthumb -mcpu=cortex-m0 hello.c -o hello.o
yes I know this is an m3
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 5.4.0
gives
00000000 <soft_delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <soft_delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a03 ldr r2, [pc, #12] ; (24 <soft_delay+0x24>)
16: 4293 cmp r3, r2
18: d9f8 bls.n c <soft_delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46bd mov sp, r7
1e: b002 add sp, #8
20: bd80 pop {r7, pc}
22: 46c0 nop ; (mov r8, r8)
24: 00001fff
first check the test infrastructure
.cpu cortex-m0
.thumb
.align 8
.word 0,0
.thumb_func
.globl TEST
TEST:
push {r4,r5,r6,lr}
mov r4,r0
mov r5,r1
ldr r6,[r4]
inner:
bl soft_delay
sub r5,#1
bne inner
ldr r3,[r4]
sub r0,r6,r3
pop {r4,r5,r6,pc}
.align 8
soft_delay:
bx lr
in the openocd telnet window
reset halt
flash write_image erase notmain.thumb.elf
reset
gives
12345678
00001B59
7001 clocks, assuming the systick matches the cpu, thats 7001 arm clocks, 4 instructions per loop.
Step back note I aligned some things
08000108 <TEST>:
8000108: b570 push {r4, r5, r6, lr}
800010a: 1c04 adds r4, r0, #0
800010c: 1c0d adds r5, r1, #0
800010e: 6826 ldr r6, [r4, #0]
08000110 <inner>:
8000110: f000 f876 bl 8000200 <soft_delay>
8000114: 3d01 subs r5, #1
8000116: d1fb bne.n 8000110 <inner>
8000118: 6823 ldr r3, [r4, #0]
800011a: 1af0 subs r0, r6, r3
800011c: bd70 pop {r4, r5, r6, pc}
08000200 <soft_delay>:
8000200: 4770 bx lr
both loops are nicely aligned.
Now if I do this:
0800010a <TEST>:
800010a: b570 push {r4, r5, r6, lr}
800010c: 1c04 adds r4, r0, #0
800010e: 1c0d adds r5, r1, #0
8000110: 6826 ldr r6, [r4, #0]
08000112 <inner>:
8000112: f000 f875 bl 8000200 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
800011a: 6823 ldr r3, [r4, #0]
800011c: 1af0 subs r0, r6, r3
800011e: bd70 pop {r4, r5, r6, pc}
Simply changing the alignment of the code that is supposed to be testing the code under test I now get:
00001F40
8000 ticks to do that loop 1000 times with that call with the code function under test still being aligned
08000200 <soft_delay>:
8000200: 4770 bx lr
The .align 8, in general don't use .align with a number on gnu its behavior does not translate across targets. .balign is better. Anyway I used it. The two words are because the align made TEST aligned, but inner is what I wanted aligned so I added two words to make it aligned.
.align 8
.word 0,0
nop
.thumb_func
.globl TEST
TEST:
push {r4,r5,r6,lr}
mov r4,r0
mov r5,r1
ldr r6,[r4]
inner:
bl soft_delay
sub r5,#1
bne inner
ldr r3,[r4]
sub r0,r6,r3
pop {r4,r5,r6,pc}
A little code review to make sure I didn't make a mistake here.
r0 is the systick current value register
r1 is the number of loops I want to run the code under test
The calling convention allows for r0-r3 to be clobbered so I need to move r0 and r1 to non-volatile registers (per the calling convention).
I want to sample the time the instruction before the loop and the instruction after.
so I need two registers for r0 and r1 and a register to store the begin time so r4,r5,r6 and that fits in nicely to have an even number of registers pushed on the stack. Have to preserve lr so we can return.
we can now safely call soft_delay in the loop, subtract the count, branch if not equal to inner, once the count is done read the timer in r3. from output above this is a down counter so subtract end from beginning, technically since this is a 24 bit counter I should and with 0x00FFFFFF to correctly do that subtraction, but because this isn't going to roll over I can assume out that operation. result/return value goes in r0, pop everything which includes popping the pc to do the return to the C calling function which prints out r0's value.
I think the test code is good.
reading the CPUID register
411FC231
So that means r1p1, while the TRM I am using is written for r2p1 you have to be very careful to use the right document but also sometimes use the current document or all the ones in between if available to see what changed.
ICode memory interface
Instruction fetches from Code memory space 0x00000000 to 0x1FFFFFFF are performed over the 32-bit AHB-Lite bus. The Debugger cannot access this interface. All fetches are word-wide. The number of instructions fetched per word depends on the code running and the alignment of the code in memory.
Sometimes in ARM TRMs you see the fetch info up top near the processor features, this tells me what I wanted to know.
08000112 <inner>:
8000112: f000 f875 bl 8000200 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
this requires a fetch at 110, 114 and 118.
08000110 <inner>:
8000110: f000 f876 bl 8000200 <soft_delay>
8000114: 3d01 subs r5, #1
8000116: d1fb bne.n 8000110 <inner>
This a fetch at 110 and 114, but not one at 118, so that extra fetch could be our added clock. the m3 was the first publicly available one and it has a lot of features in the core that went away and similar ones came back. Some of the smaller cores fetch differently and you don't see this alignment issue. with bigger cores like full sized ones they fetch sometimes 4 or 8 instructions at a time and you have to change your alignment even more to hit the boundary but you can hit the boundary and since it is 2 or 4 clocks plus bus overhead for the extra fetch you can see those.
If I put two nops
nop
nop
.thumb_func
.globl TEST
TEST:
gives
08000114 <inner>:
8000114: f000 f874 bl 8000200 <soft_delay>
8000118: 3d01 subs r5, #1
800011a: d1fb bne.n 8000114 <inner>
800011c: 6823 ldr r3, [r4, #0]
800011e: 1af0 subs r0, r6, r3
8000120: bd70 pop {r4, r5, r6, pc}
gives
00001B59
So that's good we are back to that number, could try a few more to confirm but it appears that alignment is sensitive to our outer test loop, which is bad, but we can manage that, don't change it it won't affect the test. If I didn't care about alignment and had something like this:
void soft_delay(void) {
for (volatile unsigned int i=0; i<0x2000; ++i) { }
}
int notmain ( void )
{
unsigned int ra;
unsigned int beg;
unsigned int end;
PUT32(STK_CSR,4);
PUT32(STK_RVR,0x00FFFFFF);
PUT32(STK_CVR,0x00000000);
PUT32(STK_CSR,5);
uart_init();
hexstring(0x12345678);
beg=GET32(STK_CVR);
for(ra=0;ra<1000;ra++)
{
soft_delay();
}
end=GET32(STK_CVR);
hexstring((beg-end)&0x00FFFFFF);
return(0);
}
Then as I played with optimization options and I also played with using different compilers any change in the program/binary in front of the test loop would/could move the test loop changing its performance, in my simple example it was a 14% performance difference, that's massive if you are doing performance tests. letting the compiler take care of all this without us being in control the everything in front of the function under test could mess with the function under test, as written above the compiler might opt to inline the function rather than call it making an even more interesting situation as the test loop while probably not as clean as mine, certainly not if not optimized, but now the code under test is dynamic as options or alignments change.
I'm very happy you happened to be using this core/chip...
If I re-align inner and now mess with this
.align 8
nop
soft_delay:
bx lr
08000202 <soft_delay>:
8000202: 4770 bx lr
it's a single instruction which is fetched at 0x200 from what we have read and seem to be able to tell. wouldn't expect this to change anything and it didn't
00001B59
but now that we know what we know, we can use our experience to mess with this trivial Not interesting at all example.
.align 8
nop
soft_delay:
nop
bx lr
gives
00001F41
as expected. and we can have even more fun:
.align 8
.word 0,0
nop
.thumb_func
.globl TEST
TEST:
combined gives
08000112 <inner>:
8000112: f000 f876 bl 8000202 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
08000202 <soft_delay>:
8000202: 46c0 nop ; (mov r8, r8)
8000204: 4770 bx lr
no surprise if you know what you are doing:
00002328
9000 clocks, 29% performance difference. we are literally talking about 5 (technically 6) instructions, same exact machine code and by simply changing alignment the performance can be 29% different, compiler and options have nothing to do with it, yet, have not even gotten there.
How can we expect to do any kind of performance evaluation of a program using the time the code a bunch of times in a loop method? We cant unless we know what we are doing, have an understanding of the architecture, etc.
Now as it should be obvious and reading the documentation I am using the internal 8Mhz clock, everything is derived from that so the systick times are not going to sometimes vary as you might see with dram for example. The LATENCY bits in the FLASH_ACR register should have defaulted to zero wait states for 0 < SYSCLK <- 24Mhz. If I were to bump up the clock above 24Mhz, the processor is running faster but the flash is now slower relative to the processor.
Without messing with the clocks and simply adding a wait state by changing the FLASH_ACR register to 0x31.
000032C6
12998 up from 9000, I didn't expect it to double necessarily and it didn't.
Hmm for fun make a PUT16 using strh, and
.thumb_func
.globl HOP
HOP:
bx r2
and
PUT16(0x2000010a,0xb570); // 800010a: b570 push {r4, r5, r6, lr}
PUT16(0x2000010c,0x1c04); // 800010c: 1c04 adds r4, r0, #0
PUT16(0x2000010e,0x1c0d); // 800010e: 1c0d adds r5, r1, #0
PUT16(0x20000110,0x6826); // 8000110: 6826 ldr r6, [r4, #0]
PUT16(0x20000112,0xf000); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000114,0xf876); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000116,0x3d01); // 8000116: 3d01 subs r5, #1
PUT16(0x20000118,0xd1fb); // 8000118: d1fb bne.n 8000112 <inner>
PUT16(0x2000011a,0x6823); // 800011a: 6823 ldr r3, [r4, #0]
PUT16(0x2000011c,0x1af0); // 800011c: 1af0 subs r0, r6, r3
PUT16(0x2000011e,0xbd70); // 800011e: bd70 pop {r4, r5, r6, pc}
PUT16(0x20000202,0x46c0); // 8000202: 46c0 nop ; (mov r8, r8)
PUT16(0x20000204,0x4770); // 8000204: 4770 bx lr
hexstring(HOP(STK_CVR,1000,0x2000010B));
gives 0000464B
and that was not at all expected. but is 18,000 basically
Putting ram to bed after this
PUT16(0x20000108,0xb570); // 800010a: b570 push {r4, r5, r6, lr}
PUT16(0x2000010a,0x1c04); // 800010c: 1c04 adds r4, r0, #0
PUT16(0x2000010c,0x1c0d); // 800010e: 1c0d adds r5, r1, #0
PUT16(0x2000010e,0x6826); // 8000110: 6826 ldr r6, [r4, #0]
PUT16(0x20000110,0xf000); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000112,0xf876); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000114,0x3d01); // 8000116: 3d01 subs r5, #1
PUT16(0x20000116,0xd1fb); // 8000118: d1fb bne.n 8000112 <inner>
PUT16(0x20000118,0x6823); // 800011a: 6823 ldr r3, [r4, #0]
PUT16(0x2000011a,0x1af0); // 800011c: 1af0 subs r0, r6, r3
PUT16(0x2000011c,0xbd70); // 800011e: bd70 pop {r4, r5, r6, pc}
PUT16(0x20000200,0x46c0); // 8000202: 46c0 nop ; (mov r8, r8)
PUT16(0x20000200,0x4770); // 8000204: 4770 bx lr
hexstring(HOP(STK_CVR,1000,0x20000109));
00002EDE
The machine code did not change because I moved both back by 2 so the relative address between them was the same. Note that bl is two separate instructions not one 32 bit one. You cant see this in the newer docs you need to go back to the original/early ARM ARM where it is explained. And it is easy to do experiments where you split the two instructions and put other stuff in between and they work just fine, because they are two separate instructions.
At this point the reader should be able to make a 2 instruction test loop, time it and dramatically change the performance of the execution of those two instructions on this platform using the same exact machine code.
So let's try the volatile loop that you wrote.
.align 8
soft_delay:
push {r7, lr}
sub sp, #8
add r7, sp, #0
mov r3, #0
str r3, [r7, #4]
b L12
Lc:
ldr r3, [r7, #4]
add r3, #1
str r3, [r7, #4]
L12:
ldr r3, [r7, #4]
ldr r2, L24
cmp r3, r2
bls Lc
nop
mov sp, r7
add sp, #8
pop {r7, pc}
nop
.align
L24: .word 0x1FFF
this is I believe the unoptimized -O0 version. starting off with one test loop
hexstring(TEST(STK_CVR,1));
experience, the times we are seeing will overflow our 24 bit counter and the results will be very strange or lead to false conclusions.
0001801F
98,000, quick check for safety:
.align
L24: .word 0x1F
0000019F
not bad that is on par with 256 times faster.
so we have some wiggle room in our test loop but not much try 10
hexstring(TEST(STK_CVR,10));
000F012D
98334 ticks per loop.
changing the alignment
08000202 <soft_delay>:
8000202: b580 push {r7, lr}
8000204: b082 sub sp, #8
gave the same result
000F012D
not unheard of, you can examine the differences if you want count through each instruction check fetch cycles, etc.
had I made the test:
soft_delay:
nop
nop
bx lr
its two fetch cycles no matter what the alignment or if I had left it bx lr with no nops as we saw so by simply having an odd number of instructions in the test then alignment won't affect the results on fetches along, but note that from what we know now had some other code in the program moved the outer timing/test loop that may have changed performance and the results may show a difference between two tests that were purely the timing code and not the code under test (read Michael Abrash).
The cortex-m3 is based on the armv7-m architecture. If I change the compiler from -mcpu=cortex-m0 (all cortex-m compatible so far) to -mcpu=cortex-m3 (not all cortex-m compatible will break on half of them) it produces a little bit less code.
.align 8
soft_delay:
push {r7}
sub sp, #12
add r7, sp, #0
movs r3, #0
str r3, [r7, #4]
b L12
Lc:
ldr r3, [r7, #4]
add r3, #1
str r3, [r7, #4]
L12:
ldr r3, [r7, #4]
/*14: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
//cmp.w r3, #8192
.word 0x5f00f5b3
bcc Lc
nop
add r7, #12
mov sp, r7
pop {r7}
bx lr
000C80FB 81945 ticks for the code under test.
I hate unified syntax, that was a massive mistake, so I fumble along in legacy mode. thus the .word thing there in the middle.
As part of writing this I kinda messed up my system in order to demonstrate something. I was building a gcc 5.4.0 but overwrote my 9.2.0 so had to re-build both.
2.95 was the version I started using with arm and didn't support thumb gcc 3.x.x was the first to. And either gcc 4.x.x or gcc 5.x.x produced "slower" code for some of my projects, at work we are currently moving from ubuntu 16.04 to 18.04 for our build systems which if you use the apt-got cross compiler for arm that moves you from 5.x.x to 7.x.x and it is making larger binaries for the same source code and where we are tight on memory it is pushing us beyond what's available so we have to either remove some code (easiest to make the printed messages shorter, cut text out) or stick to the older compiler by building our own or apt-getting the older one. 19.10 does no longer offers the 5.x.x version.
So both are now built.
18: d3f8 bcc.n c <soft_delay+0xc>
1a: bf00 nop
1c: bf00 nop
1e: 370c adds r7, #12
these nops after bcc are baffling to me...
18: d3f8 bcc.n c <soft_delay+0xc>
1a: bf00 nop
1c: 370c adds r7, #12
gcc 5.4.0 is putting one, gcc 9.2.0 is putting two nops, ARM doesn't have the branch shadow thing of MIPS (MIPS doesn't currently either).
000C80FB gcc 5.4.0
000C8105 gcc 9.2.0
I call the function 10 times, the nop is outside the code under tests loop so has a lesser effect.
Optimized all cortex-m variants (to date) using gcc 9.2.0
soft_delay:
mov r3, #0
mov r2, #128
sub sp, #8
str r3, [sp, #4]
ldr r3, [sp, #4]
lsl r2, r2, #6
cmp r3, r2
bcs L1c
L10:
ldr r3, [sp, #4]
add r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
cmp r3, r2
bcc L10
L1c:
add sp, #8
bx lr
(also understand that not all say gcc 9.2.0 builds produce the same code when you build the compiler you have options and those options can affect the output making different builds of 9.2.0 possibly producing different results)
000C80B5
gcc 9.2.0 built for cortex-m3:
soft_delay:
mov r3, #0
sub sp, #8
str r3, [sp, #4]
ldr r3, [sp, #4]
/*8: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
.word 0x5F00F5B3
bcs L1c
Le:
ldr r3, [sp, #4]
add r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
/*16: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
.word 0x5F00F5B3
bcc Le
L1c:
add sp, #8
bx lr
000C80A1
That's in the noise. despite the code built has differences. they simply didn't gain in comparing the 0x2000 in fewer instructions. and note if you change that 0x2000 to some other number then that does not simply make the loop take that much longer it can change the generated code for architectures like this.
How I like to make these counted delay loops is to use a function outside the compile domain
extern void dummy ( unsigned int );
void soft_delay(void) {
for (unsigned int i=0; i<0x2000; ++i) { dummy(i); }
}
soft_delay:
push {r4, r5, r6, lr}
mov r5, #128
mov r4, #0
lsl r5, r5, #6
L8:
mov r0, r4
add r4, #1
bl dummy
cmp r4, r5
bne L8
pop {r4, r5, r6, pc}
the feature there is you don't need the overhead of what volatile does you do have a call and clearly there is overhead as well due to the call but not as much
000B40C9
or even better:
soft_delay:
sub r0,#1
bne soft_delay
bx lr
I would have to change the code wrapped around the code under test to make that function work.
Another note specific to these targets but also something you deal with
unsigned int more_fun ( unsigned int, unsigned int );
unsigned int fun ( unsigned int a, unsigned int b )
{
return(more_fun(a,b)+a+(b<<2));
}
00000000 <fun>:
0: b570 push {r4, r5, r6, lr}
2: 000c movs r4, r1
4: 0005 movs r5, r0
6: f7ff fffe bl 0 <more_fun>
a: 00a4 lsls r4, r4, #2
c: 1964 adds r4, r4, r5
e: 1820 adds r0, r4, r0
10: bd70 pop {r4, r5, r6, pc}
12: 46c0 nop ; (mov r8, r8)
a question repeated here at SO on a period basis. why is it pushing r6 it isn't using r6.
The compiler operates using what I call and used to be called a calling convention, now they use terms ABI, EABI, whatever either case it is the same thing it is a set of rules the compiler follows for a particular target. Arm added a rule to keep the stack aligned on a 64 bit address boundary instead of 32, this caused the extra item to keep the stack aligned, what register is used there can vary. If you use an older gcc vs a newer this can/will affect the performance of your code all by itself.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With