I have one project of data structure implement by C, it exporting sorts of APIs for other program. Recently I would like to do some optimization on Hot function profiled by grofile. Here is the project for your reference.
https://github.com/Incarnation-p-lee/libds There is one hot function binary_search_tree_node_insert, as below:
/*
 * RETURN the pointer of inserted node of the binary search tree
 *        If root is NULL or node is NULL, RETURN NULL.
 */
struct binary_search_tree *
binary_search_tree_node_insert(struct binary_search_tree *root,
    struct binary_search_tree *node)
{
    register struct binary_search_tree **iter;
    if (!node || !root) {
        pr_log_warn("Attempt to access NULL pointer.\n");
    } else {
        iter = &root;
        while (*iter) {
            if (node->chain.nice == (*iter)->chain.nice) {
                if (*iter == node) {
                    pr_log_info("Insert node exist, nothing will be done.\n");
                } else {
                    doubly_linked_list_merge((*iter)->chain.link, node->chain.link);
                }
                return *iter;
#ifndef OPT_HOT
            } else if (node->chain.nice > (*iter)->chain.nice) {
                    iter = &(*iter)->right;
            } else if (node->chain.nice < (*iter)->chain.nice) {
                    iter = &(*iter)->left;
#else
            } else {
                binary_search_tree_insert_path_go_through(node, iter);
#endif
            }
        }
        return *iter = node;
    }
    return NULL;
}
I want to optimization the two else-if part as it's half-to-half branch, which may impact on pipeline a lot. So I write one macro binary_search_tree_insert_path_go_through replace these two branch. And the implementation as follow:
/*
 * 1. node->nice => rbx, *iter => rcx.
 * 2. compare rbx, and 0x8(rcx).
 * 3. update iter.
 */
#define binary_search_tree_insert_path_go_through(node, iter) \
    asm volatile (                                            \
        "mov $0x18, %%rax\n\t"                                \
        "mov $0x20, %%rdx\n\t"                                \
        "mov 0x8(%1), %%rbx\n\t"                              \
        "mov (%0), %%rcx\n\t"                                 \
        "cmp 0x8(%%rcx), %%rbx\n\t"                           \
        "cmovg %%rdx, %%rax\n\t"                              \
        "lea (%%rcx, %%rax), %0\n\t"                          \
        :"+r"(iter)                                           \
        :"r"(node)                                            \
        :"rax", "rbx", "rcx", "rdx")
But the unit test of this function has dropped about 6-8% about this change. From the objdump code(optimized code on right hand), it has less instructions, I can hardly understand why it cost more time before optimization.

And there are some details for you reference:
struct collision_chain {
    struct doubly_linked_list *link;
    sint64                    nice;
};
/*
 * binary search tree
 */
struct binary_search_tree {
    struct collision_chain chain;
    sint32                 height;  /* reserved for avl */
    /* root node has height 0, NULL node has height -1 */
    union {
        struct binary_search_tree *left;
        struct avl_tree           *avl_left;    /* reserved for avl   */
        struct splay_tree         *splay_left;  /* reserved for splay */
    };
    union {
        struct binary_search_tree *right;
        struct avl_tree           *avl_right;    /* reserved for avl   */
        struct splay_tree         *splay_right;  /* reserved for splay */
    };
};
struct doubly_linked_list {
    uint32                    sid;
    void                      *val;
    struct doubly_linked_list *next;
    struct doubly_linked_list *previous;
};
It runs on virtual-box with 2 core of i5-3xxM, and cpuinfo as following:
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping        : 9
microcode       : 0x19
cpu MHz         : 2568.658
cache size      : 6144 KB
physical id     : 0
siblings        : 2
core id         : 0
cpu cores       : 2
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 5
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc rep_good nopl pni ssse3 lahf_lm
bogomips        : 5137.31
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:
processor       : 1
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping        : 9
microcode       : 0x19
cpu MHz         : 2568.658
cache size      : 6144 KB
physical id     : 0
siblings        : 2
core id         : 1
cpu cores       : 2
apicid          : 1
initial apicid  : 1
fpu             : yes
fpu_exception   : yes
cpuid level     : 5
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc rep_good nopl pni ssse3 lahf_lm
bogomips        : 5137.31
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:
I don't know if it's the same for modern processors, but Linus really didn't like the CMOV instruction back in '07.
Since you are micro-optimizing, move the equality check to the last position. It's almost always false, yet you're making it in every iteration.
Moreover, I'd try not using the pointer-to-pointer pattern. Indirection tends to make optimizers choke due to pointer aliasing concerns. Instead, use an inch-worm pattern with two pointers:
void insert(NODE *x, NODE **root) {
  NODE *trail = NULL;
  NODE *lead = *root;
  while (lead) {
    trail = lead;
    if (x->key < lead->key)
      lead = lead->left;
    else if (x->key > lead->key)
      lead = lead->right;
    else return; // do nothing;
  }
  // lead has found null, so insert
  if (trail)
    // redo the last key comparison
    if (x->key < trail->key)
      trail->left = x;
    else
      trail->right = x;
  else 
    *root = x;
}
On my MacBook, this compiles to the following 64-bit code, where the loop includes only 10 instructions. It's hard to tell from the tiny listings in your post, but it looks like it's is considerably longer:
    pushq   %rbp
    movq    %rsp, %rbp
    movq    (%rsi), %rdx
    testq   %rdx, %rdx
    je      LBB0_11
    movl    16(%rdi), %ecx
LBB0_2:                                 
    movq    %rdx, %rax     # dx=lead, ax=trail
    cmpl    16(%rax), %ecx # comparison with key
    jge     LBB0_4         # first branch
    movq    %rax, %rdx     # go left (redundant because offset(left)==0!)
    jmp     LBB0_6
LBB0_4:                                 
    jle     LBB0_12        # second branch
    leaq    8(%rax), %rdx  # go right
LBB0_6:                                 
    movq    (%rdx), %rdx   # move lead down the tree
    testq   %rdx, %rdx     # test for null
    jne     LBB0_2         # loop if not
    testq   %rax, %rax     # insertion logic
    je      LBB0_11
    movl    16(%rdi), %ecx
    cmpl    16(%rax), %ecx
    jge     LBB0_10
    movq    %rdi, (%rax)
    popq    %rbp
    retq
LBB0_11:
    movq    %rdi, (%rsi)
LBB0_12:                   # return for equal keys
    popq    %rbp
    retq
LBB0_10:
    movq    %rdi, 8(%rax)
    popq    %rbp
    retq
If comparisons are expensive, (you don't show what "nice" is), you could also experiment with storing the binary result of the trail comparison so that the final check uses this rather than redoing the comparison.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With