- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 65
Open
Description
Analysis of the MuxTO binary shows quite a bit of flash space being used on gcc library functions that are not optimized for either small ARM chips, or the Arduino environment.
Providing space-optimized and perhaps simplified versions of 32-bit divide and malloc/free saves about 750 bytes.
(add this "picolib.c" file to the MuxTO directory.)
/*
 * Smaller replacements for some big libgcc and libc functions,
 * suitable for use with this program
 */
#define SMALLER_UDIV 1
#define SMALLER_MALLOC 1
#define HEAP_SIZE 1500
#include <stdint.h>
#ifdef SMALLER_UDIV
/*
;;; Smaller 32bit division function for CM0+ The gnu libc __udivsi3
;;;   function unrolls the bitwise loop, which is quicker, but
;;;   significantly larger (255+ bytes), and undesirable on chips with
;;;   small flash memories.  This is slower and smaller
;;;
;;; This is essentially code taken from Yiu's "The Definitive Guide to
;;;   the Cortex-M0 and Cortex-M0+ Processors", with slight modifications.
;;; By Bill Westfield (WestfW), Aug 2025
;;; Note that the libgcc module being replaced defines three different
;;;   symbols, and we have to define all three if we want our code to replace
;;;   that module without causing "multiple definition" link errors.
;;;     __udivsi3 is the basic worker function
;;;     __aeabi_uidiv is an alias for that (the name defined by ARM's
;;;       run-time ABI, rather than gcc's own name)
;;;     __aeabi_uidivmod does 0 divisor check and has the explicit remainder.
;;;   (those are all compatible, WRT the actual math performed.)
*/
/*
 * Unsigned 32-bit divide for Thumb, written as a bit-at-a-time
 * shift-and-subtract loop (one quotient bit per pass, 32 passes).
 *
 * The four labels below all name the same entry point.  Each label is
 * paired with a .global for that SAME name so the symbol is visible to
 * the linker; a label without a matching .global stays local to this
 * object file.
 *
 * r2-r4 are saved and restored around the loop; the condition flags are
 * modified.  When the divisor is zero the routine returns immediately
 * with r0 = 0 and r1 left as it was on entry (i.e. 0).
 */
asm("__udivsi3: .global __udivsi3\n"
    "__aeabi_uidivmod: .global __aeabi_uidivmod\n"
    "__aeabi_uidiv: .global __aeabi_uidiv\n"
    "myudivsi3: .global myudivsi3\n"
    ".syntax unified\n"
    ".thumb\n"
    /*
    * Inputs:
    *    R0 = dividend
    *    R1 = divider
    * Outputs
    *    R0 = quotient
    *    R1 = remainder
    */
    "        cmp      r1, #0\n"
    "        beq     divzero\n"     // Zero divisor: skip the loop entirely
    "        push    {r2-r4, lr}\n" // Save registers
    "        movs    r2, r0\n"      // copy dividend
    "        movs    r3, #1\n"      // counter
    "        lsls    r3, #31\n"     // N = 0x80000000 (current quotient bit)
    "        movs    r0, #0\n"      // initial Quotient
    "        movs    r4, #0\n"      // initial Tmp (running remainder)
    "loop:\n"
    "        lsls    r2, #1\n"      // Shift dividend, MSB goes into carry
    "        adcs    r4, r4\n"      // Shift Tmp, carry moves into LSB
    "        cmp     r4, r1\n"
    "        bcc     lessthan\n"    // Tmp < divisor: this quotient bit is 0
    "        adds    r0, r3\n"      // Set the current bit in the quotient
    "        subs    r4, r1\n"      // and take the divisor out of Tmp
    "lessthan:\n"
    "        lsrs    r3, #1\n"      // Next lower quotient bit
    "        bne     loop\n"        // Done once the bit has shifted out
    "        movs    R1, R4\n"      // remainder in R1, Quotient already in R0
    "        pop     {r2-r4, pc}\n"
    "divzero: movs   r0, #0\n"      // Divide by zero yields 0, like libc
    "         bx      lr\n"
    "        .size myudivsi3, .-myudivsi3\n"
   );
#endif
#ifdef SMALLER_MALLOC
/*
 * MuxTO uses malloc in relatively few places, for initialization/construction
 * of various structures in the Arduino core (notably Serial and USB buffers.)
 * They are probably not free'ed (Hmm.  Maybe on USB re-initialization?)
 * So we can probably get by with a much simpler equivalent of malloc()
 * The total space required is 4*Serial buffers (64 bytes each) plus a
 * USB DoubleBufferedEPOutHandler (184 bytes or so?), for a total of 440bytes.
 * Note that this heap is already zeroed as a result of being in bss.
 */
/*
 * Running total of the byte counts passed to malloc(), kept as an aid for
 * choosing HEAP_SIZE.  Every request is counted, including requests that
 * fail; alignment padding is not included.  Being 16 bits wide, the value
 * wraps modulo 65536.
 */
uint16_t malloc_total = 0;

/*
 * Minimal bump allocator backed by a fixed static array of HEAP_SIZE bytes.
 *
 * size:    number of bytes requested.
 * returns: pointer to an 8-byte-aligned region of at least `size` bytes,
 *          or a null pointer when the remaining heap is too small.
 *
 * Memory is handed out front to back and is never reclaimed.
 * A request of 0 bytes consumes nothing and returns the current
 * front-of-free-space pointer.
 */
void *malloc(unsigned int size) {
  /* Aligned so that every 8-byte-rounded offset yields an aligned pointer. */
  static uint8_t theHeap[HEAP_SIZE] __attribute__((aligned(8)));
  static unsigned int used = 0;          /* bytes of theHeap already handed out */
  unsigned int avail = HEAP_SIZE - used; /* invariant: used <= HEAP_SIZE */
  unsigned int rounded;
  void *p;

  malloc_total += size;

  /* Refuse rather than return a pointer that runs past the end of theHeap. */
  if (size > avail) {
    return (void *)0;
  }

  /* Round up so the NEXT allocation also starts on an 8-byte boundary.
   * Cannot wrap: size <= avail <= HEAP_SIZE here. */
  rounded = (size + 7u) & ~7u;
  if (rounded > avail) {
    rounded = avail; /* final chunk: the padding would not fit, the data does */
  }

  p = &theHeap[used];
  used += rounded;
  return p;
}
/*
 * Companion to the malloc() replacement in this file.
 *
 * Deliberately does nothing: the pointer is ignored and no memory is
 * returned to the heap.
 *
 * p: pointer being released (unused).
 */
void free(void *p) {
  (void)p; /* intentionally unused */
}
#endif
Ashnice
Metadata
Metadata
Assignees
Labels
No labels