I been experimenting a little with expressive templates and rope data structures to try and see what kind of gains can be achieved. So far it is working quite nicely. However, when attempting to concatenate more than 5 arguments together the compiler fails to optimize properly and generates unnecessary temporaries. Could someone enlighten me why this is happening? Is it a compiler limitation or do my optimization options require tweaking?
I am using g++ 4.4.1 (mingw32) with the following options: -O3 -Winline -Wextra -Wall -fno-exceptions -fno-rtti -fomit-frame-pointer -fexpensive-optimizations -fverbose-asm -S
The code is below, it is just an experimentation so it does not really follow any standards:
#include <stdio.h>
template<typename Derived>
struct rope_base {
    const Derived & ref() const;
};
struct string {
    size_t m_length;
    char * m_value;
    template<typename Derived>
    string(const rope_base<Derived> & rope);
    ~string();
    const char * data() const;
    size_t length() const;
    char * write_to(char * dst) const;
};
struct static_string {
    const char * m_value;
    const size_t m_length;
    static_string(const char * value);
    size_t length() const;
    char * write_to(char * dst) const;
};
template<typename T>
struct rope_traits {
    typedef const T type;
};
template<>
struct rope_traits<string> {
    typedef const string & type;
};
template<>
struct rope_traits<static_string> {
    typedef const static_string & type;
};
template<typename Left, typename Right>
struct rope : public rope_base<rope<Left, Right> > {
    typename rope_traits<Left>::type m_left;
    typename rope_traits<Right>::type m_right;
    rope(const Left & left, const Right & right);
    size_t length() const;
    char * write_to(char * dst) const;
};
inline static_string::static_string(const char * value)
: m_value(value)
, m_length(__builtin_strlen(m_value)) {}
inline size_t static_string::length() const {
    return m_length;
}
inline char * static_string::write_to(char * dst) const {
    __builtin_memcpy(dst, m_value, m_length);
    return dst + m_length;
}
template<typename Derived>
inline string::string(const rope_base<Derived> & rope)
: m_length(rope.ref().length())
, m_value(new char[m_length + 1]) {
    *rope.ref().write_to(m_value) = 0;
}
inline string::~string() {
    delete[] m_value;
}
inline const char * string::data() const {
    return m_value;
}
inline size_t string::length() const {
    return m_length;
}
template<typename Derived>
inline const Derived & rope_base<Derived>::ref() const {
    return static_cast<const Derived &>(*this);
}
template<typename Left, typename Right>
inline rope<Left, Right>::rope(const Left & left, const Right & right)
: m_left(left)
, m_right(right) {}
template<typename Left, typename Right>
inline size_t rope<Left, Right>::length() const {
    return m_left.length() + m_right.length();
}
template<typename Left, typename Right>
inline char * rope<Left, Right>::write_to(char * dst) const {
    return m_right.write_to(m_left.write_to(dst));
}
inline rope<static_string, static_string> operator+(const static_string & left, const static_string & right) {
    return rope<static_string, static_string>(left, right);
}
template<typename Left>
inline rope<Left, static_string> operator+(const rope_base<Left> & left, const static_string & right) {
    return rope<Left, static_string>(left.ref(), right);
}
template<typename Right>
inline rope<static_string, Right> operator+(const static_string & left, const rope_base<Right> & right) {
    return rope<static_string, Right>(left, right.ref());
}
template<typename Left, typename Right>
inline rope<Left, Right> operator+(const rope_base<Left> & left, const rope_base<Right> & right) {
    return rope<Left, Right>(left.ref(), right.ref());
}
typedef static_string ss;
int main(int, char **)
{
    // works up to 5
    string s(ss("111111111111") + "222222222222" + "333333333333" + "444444444444" + "555555555555");
    printf("%d %s\n", s.length(), s.data());
    return 0;
}
The code above generates quite nice assembler output which is fully inlined and all arguments have been reduced to constants:
.def    ___main;    .scl    2;    .type    32;    .endef
.section .rdata,"dr"
LC0:
.ascii "444444444444\0"
LC1:
.ascii "333333333333\0"
LC2:
.ascii "222222222222\0"
LC3:
.ascii "111111111111\0"
LC4:
.ascii "555555555555\0"
LC5:
.ascii "%d %s\12\0"
.text
.p2align 2,,3
.globl _main
.def    _main;    .scl    2;    .type    32;    .endef
_main:
pushl    %ebp     #
movl    %esp, %ebp     #,
andl    $-16, %esp     #,
pushl    %edi     #
pushl    %esi     #
pushl    %ebx     #
subl    $20, %esp     #,
call    ___main     #
movl    $LC3, %esi     #, D.2495
movl    $61, (%esp)     #,
call    __Znaj     #
movl    %eax, %ebx     #, D.3126
movl    $3, %ecx     #, tmp74
movl    %eax, %edi     # D.3126, D.3125
rep movsl
leal    12(%eax), %eax     #, D.3180
movb    $3, %cl     #,
movl    %eax, %edi     # D.3180, D.3180
movl    $LC2, %esi     #, D.2496
rep movsl
leal    24(%ebx), %eax     #, D.3186
movb    $3, %cl     #,
movl    %eax, %edi     # D.3186, D.3186
movl    $LC1, %esi     #, D.2502
rep movsl
leal    36(%ebx), %eax     #, D.3192
movb    $3, %cl     #,
movl    %eax, %edi     # D.3192, D.3192
movl    $LC0, %esi     #, D.2539
rep movsl
leal    48(%ebx), %eax     #, D.3198
movl    $LC4, %esi     #, tmp87
movb    $3, %cl     #,
movl    %eax, %edi     # D.3198, D.3198
rep movsl
movb    $0, 12(%eax)     #,
movl    %ebx, 8(%esp)     # D.3126,
movl    $60, 4(%esp)     #,
movl    $LC5, (%esp)     #,
call    _printf     #
testl    %ebx, %ebx     # D.3126
je    L2     #,
movl    %ebx, (%esp)     # D.3126,
call    __ZdaPv     #
L2:
xorl    %eax, %eax     #
addl    $20, %esp     #,
popl    %ebx     #
popl    %esi     #
popl    %edi     #
leave
ret
.def    __Znaj;    .scl    2;    .type    32;    .endef
.def    _printf;    .scl    2;    .type    32;    .endef
.def    __ZdaPv;    .scl    2;    .type    32;    .endef
Inlining fails when adding one or more parameters to the concatenation, resulting in temporaries being copied around and parameters being treated as variables:
    .def    ___main;    .scl    2;    .type    32;    .endef
    .section .rdata,"dr"
LC0:
    .ascii "777777777777\0"
LC1:
    .ascii "666666666666\0"
LC2:
    .ascii "555555555555\0"
LC3:
    .ascii "444444444444\0"
LC4:
    .ascii "333333333333\0"
LC5:
    .ascii "222222222222\0"
LC6:
    .ascii "111111111111\0"
LC7:
    .ascii "888888888888\0"
LC8:
    .ascii "%d %s\12\0"
    .text
    .p2align 2,,3
.globl _main
    .def    _main;    .scl    2;    .type    32;    .endef
_main:
    pushl    %ebp     #
    movl    %esp, %ebp     #,
    andl    $-16, %esp     #,
    pushl    %edi     #
    pushl    %esi     #
    pushl    %ebx     #
    subl    $228, %esp     #,
    call    ___main     #
    movl    $LC0, 168(%esp)     #, D.2650.m_value
    movl    $12, 172(%esp)     #, D.2650.m_length
    movl    $LC1, 176(%esp)     #, D.2613.m_value
    movl    $12, 180(%esp)     #, D.2613.m_length
    movl    $LC2, 184(%esp)     #, D.2576.m_value
    movl    $12, 188(%esp)     #, D.2576.m_length
    movl    $LC3, 192(%esp)     #, D.2539.m_value
    movl    $12, 196(%esp)     #, D.2539.m_length
    movl    $LC4, 200(%esp)     #, D.2502.m_value
    movl    $12, 204(%esp)     #, D.2502.m_length
    movl    $LC5, 208(%esp)     #, D.2496.m_value
    movl    $12, 212(%esp)     #, D.2496.m_length
    movl    $LC6, 216(%esp)     #, D.2495.m_value
    movl    $12, 220(%esp)     #, D.2495.m_length
    leal    216(%esp), %eax     #, tmp78
    movl    %eax, 152(%esp)     # tmp78, D.2571.m_left.m_left.m_left
    leal    208(%esp), %eax     #, tmp79
    movl    %eax, 156(%esp)     # tmp79, D.2571.m_left.m_left.m_right
    leal    200(%esp), %eax     #, tmp80
    movl    %eax, 160(%esp)     # tmp80, D.2571.m_left.m_right
    leal    192(%esp), %eax     #, tmp81
    movl    %eax, 164(%esp)     # tmp81, D.2571.m_right
    leal    132(%esp), %edi     #, tmp82
    leal    152(%esp), %esi     #, tmp83
    movl    $4, %ecx     #, tmp84
    rep movsl
    leal    184(%esp), %eax     #, tmp85
    movl    %eax, 148(%esp)     # tmp85, D.2608.m_right
    leal    108(%esp), %edi     #, tmp86
    leal    132(%esp), %esi     #, tmp87
    movb    $5, %cl     #,
    rep movsl
    leal    176(%esp), %eax     #, tmp89
    movl    %eax, 128(%esp)     # tmp89, D.2645.m_right
    leal    80(%esp), %edi     #, tmp90
    leal    108(%esp), %esi     #, tmp91
    movb    $6, %cl     #,
    rep movsl
    leal    168(%esp), %eax     #, tmp93
    movl    %eax, 104(%esp)     # tmp93, D.2682.m_right
    leal    48(%esp), %edi     #, tmp94
    leal    80(%esp), %esi     #, tmp95
    movb    $7, %cl     #,
    rep movsl
    movl    48(%esp), %ebx     # D.2719.m_left.m_left.m_left.m_left.m_left.m_left.m_left, SR.35
    movl    52(%esp), %edx     # D.2719.m_left.m_left.m_left.m_left.m_left.m_left.m_right, SR.34
    movl    56(%esp), %eax     # D.2719.m_left.m_left.m_left.m_left.m_left.m_right,
    movl    %eax, 36(%esp)     #, %sfp
    movl    60(%esp), %eax     # D.2719.m_left.m_left.m_left.m_left.m_right,
    movl    %eax, 32(%esp)     #, %sfp
    movl    64(%esp), %eax     # D.2719.m_left.m_left.m_left.m_right,
    movl    %eax, 28(%esp)     #, %sfp
    movl    68(%esp), %eax     # D.2719.m_left.m_left.m_right,
    movl    %eax, 24(%esp)     #, %sfp
    movl    72(%esp), %eax     # D.2719.m_left.m_right,
    movl    %eax, 20(%esp)     #, %sfp
    movl    4(%ebx), %eax     # <variable>.m_length, tmp97
    addl    4(%edx), %eax     # <variable>.m_length, tmp97
    addl    $12, %eax     #,
    movl    %eax, 44(%esp)     #, %sfp
    movl    36(%esp), %eax     # %sfp,
    movl    4(%eax), %eax     # <variable>.m_length,
    addl    %eax, 44(%esp)     #, %sfp
    movl    32(%esp), %eax     # %sfp,
    movl    4(%eax), %eax     # <variable>.m_length,
    addl    %eax, 44(%esp)     #, %sfp
    movl    28(%esp), %eax     # %sfp,
    movl    4(%eax), %eax     # <variable>.m_length,
    addl    %eax, 44(%esp)     #, %sfp
    movl    24(%esp), %eax     # %sfp,
    movl    4(%eax), %eax     # <variable>.m_length,
    addl    %eax, 44(%esp)     #, %sfp
    movl    20(%esp), %eax     # %sfp,
    movl    4(%eax), %eax     # <variable>.m_length,
    addl    %eax, 44(%esp)     #, %sfp
    movl    44(%esp), %eax     # %sfp, tmp105
    incl    %eax     # tmp105
    movl    %eax, (%esp)     # tmp105,
    movl    %edx, 16(%esp)     #,
    call    __Znaj     #
    movl    %eax, 40(%esp)     #, %sfp
    movl    (%ebx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%ebx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     #, D.3662
    rep movsb
    movl    40(%esp), %eax     # %sfp, D.3735
    addl    4(%ebx), %eax     # <variable>.m_length, D.3735
    movl    16(%esp), %edx     #,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3735, D.3735
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3741
    movl    36(%esp), %edx     # %sfp,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3741, D.3741
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3747
    movl    32(%esp), %edx     # %sfp,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3747, D.3747
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3753
    movl    28(%esp), %edx     # %sfp,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3753, D.3753
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3759
    movl    24(%esp), %edx     # %sfp,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3759, D.3759
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3765
    movl    20(%esp), %edx     # %sfp,
    movl    (%edx), %esi     # <variable>.m_value, <variable>.m_value
    movl    4(%edx), %ecx     # <variable>.m_length, <variable>.m_length
    movl    %eax, %edi     # D.3765, D.3765
    rep movsb
    addl    4(%edx), %eax     # <variable>.m_length, D.3771
    movl    $LC7, %esi     #, tmp148
    movb    $3, %cl     #,
    movl    %eax, %edi     # D.3771, D.3771
    rep movsl
    movb    $0, 12(%eax)     #,
    movl    40(%esp), %eax     # %sfp,
    movl    %eax, 8(%esp)     #,
    movl    44(%esp), %edx     # %sfp,
    movl    %edx, 4(%esp)     #,
    movl    $LC8, (%esp)     #,
    call    _printf     #
    movl    40(%esp), %eax     # %sfp,
    testl    %eax, %eax     #
    je    L2     #,
    movl    40(%esp), %eax     # %sfp,
    movl    %eax, (%esp)     #,
    call    __ZdaPv     #
L2:
    xorl    %eax, %eax     #
    addl    $228, %esp     #,
    popl    %ebx     #
    popl    %esi     #
    popl    %edi     #
    leave
    ret
    .def    __Znaj;    .scl    2;    .type    32;    .endef
    .def    _printf;    .scl    2;    .type    32;    .endef
    .def    __ZdaPv;    .scl    2;    .type    32;    .endef
A few things (per the discussion in comments):
Upgrade your gcc. A lot of optimisation improvements were made in the 4.6 series, including some improvements to inlining.
With a newer gcc -Winline warns that your string constructor will not be inlined:
warning: inlining failed in call to 'string::string(const rope_base<Derived>&) [with Derived = rope<rope<rope<rope<rope<static_string, static_string>, static_string> static_string>, static_string>, static_string>]': call is unlikely and code size would grow [-Winline]
I'm not quite sure why gcc gives this particular message (which is usually pretty clearly related to an inline call in an unlikely branch), but the root of the problem is the call to new() in that constructor. Using a static fixed size buffer produces nice compact asm. Using malloc, or wrapping new in a non-inlined function call allows the constructor (but not malloc/function) to be inlined.
Which approach to take really depends on your specific use case. If the length of the strings could be known at compile time, or given a maximum value, as in your example, you can obviously completely inline. In the general case though, you are always going to have to give something up.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With