C - unrolling loops with macros needed


Consider the following code:

/* Shared state defined elsewhere in the project (only declared here). */
extern int conn_fds[max_clients];
extern int fl_conn_indexes[max_clients];
extern int fl_req_bufs[max_clients];
extern struct epoll_event estab_events[max_threads];
extern req_buf_t req_bufs[max_req_bufs];
extern int fl_req_bufs_top;
extern int conn_statuses[max_clients];
extern int fl_conn_indexes_top;
extern  tcpl_nc_t nc_http_list;
extern struct sockaddr_in conn_addresses[max_clients];

/*
 * accept_connections - hand-unrolled batch accept.
 *
 * Works in three phases, each entered through a GCC computed goto
 * (labels-as-values: `&&label` / `goto *ptr`) indexed by num_conns, and
 * each falling through from the entry label down to the ...0 label:
 *
 *   a-labels: copy entries of fl_conn_indexes[] (addressed relative to
 *             fl_conn_indexes_top) into the local conn_indexes[] array;
 *   b-labels: call accept() on nc_http_list.sock_fd for each copied index,
 *             storing the returned descriptor in conn_fds[] and the peer
 *             address in conn_addresses[];
 *   c-labels: set conn_statuses[] for each copied index to
 *             conn_status_reading.
 *
 * Between the first and second phase fl_conn_indexes_top is decreased by
 * num_conns.
 *
 * num_conns: index into the label tables. The function returns without
 *            doing anything when fl_conn_indexes_top - num_conns <= 0.
 *
 * NOTE(review): the label tables are declared with max_threads elements and
 * initialised with 16 labels; num_conns is not range-checked against either
 * before it is used as an index -- confirm callers guarantee the range.
 * NOTE(review): `fds` and `conn_idx` are declared but never used here.
 * NOTE(review): the values returned by accept() are stored without being
 * checked for failure.
 */
void accept_connections(unsigned int num_conns) {
    int fds[max_threads];
    int conn_indexes[max_threads];
    int conn_idx=0;
    int new_bottom;
    socklen_t slenghts[max_threads];
    /* Jump tables: element k holds the address of label a<k> / b<k> / c<k>. */
    void *labels1[max_threads] = {&&a0,&&a1,&&a2,&&a3,&&a4,&&a5,&&a6,&&a7,&&a8,&&a9,&&a10,&&a11,&&a12,&&a13,&&a14,&&a15};
    void *labels2[max_threads] = {&&b0,&&b1,&&b2,&&b3,&&b4,&&b5,&&b6,&&b7,&&b8,&&b9,&&b10,&&b11,&&b12,&&b13,&&b14,&&b15};
    void *labels3[max_threads] = {&&c0,&&c1,&&c2,&&c3,&&c4,&&c5,&&c6,&&c7,&&c8,&&c9,&&c10,&&c11,&&c12,&&c13,&&c14,&&c15};

    /* Bail out when taking num_conns entries would leave nothing below the top. */
    new_bottom=fl_conn_indexes_top-num_conns;
    if (new_bottom<=0) return;

    /*
     * Phase 1: copy entries from fl_conn_indexes[] into conn_indexes[].
     * Label a<k> uses offset (16-k); entering at a<k> runs a<k> down to a0,
     * i.e. k+1 assignments covering offsets (16-k) through 16.
     * NOTE(review): confirm this is the intended set of slots -- the guard
     * above tests fl_conn_indexes_top-num_conns, not fl_conn_indexes_top-16.
     */
    goto *labels1[num_conns];

a15:
    conn_indexes[max_threads- 1]=fl_conn_indexes[fl_conn_indexes_top- 1];

a14:
    conn_indexes[max_threads- 2]=fl_conn_indexes[fl_conn_indexes_top- 2];

a13:
    conn_indexes[max_threads- 3]=fl_conn_indexes[fl_conn_indexes_top- 3];

a12:
    conn_indexes[max_threads- 4]=fl_conn_indexes[fl_conn_indexes_top- 4];

a11:
    conn_indexes[max_threads- 5]=fl_conn_indexes[fl_conn_indexes_top- 5];

a10:
    conn_indexes[max_threads- 6]=fl_conn_indexes[fl_conn_indexes_top- 6];

a9:
    conn_indexes[max_threads- 7]=fl_conn_indexes[fl_conn_indexes_top- 7];

a8:
    conn_indexes[max_threads- 8]=fl_conn_indexes[fl_conn_indexes_top- 8];

a7:
    conn_indexes[max_threads- 9]=fl_conn_indexes[fl_conn_indexes_top- 9];

a6:
    conn_indexes[max_threads-10]=fl_conn_indexes[fl_conn_indexes_top-10];

a5:
    conn_indexes[max_threads-11]=fl_conn_indexes[fl_conn_indexes_top-11];

a4:
    conn_indexes[max_threads-12]=fl_conn_indexes[fl_conn_indexes_top-12];

a3:
    conn_indexes[max_threads-13]=fl_conn_indexes[fl_conn_indexes_top-13];

a2:
    conn_indexes[max_threads-14]=fl_conn_indexes[fl_conn_indexes_top-14];

a1:
    conn_indexes[max_threads-15]=fl_conn_indexes[fl_conn_indexes_top-15];

a0:
    conn_indexes[max_threads-16]=fl_conn_indexes[fl_conn_indexes_top-16];

    /* Lower the top marker by num_conns before the accept phase. */
    fl_conn_indexes_top=fl_conn_indexes_top-num_conns;
    /*
     * Phase 2: accept one connection per copied index (same fall-through
     * scheme as phase 1).
     * NOTE(review): slenghts[] is never assigned in this function before its
     * elements are passed to accept(); POSIX accept() reads *address_len as
     * the size of the address buffer on input -- confirm whether each element
     * should be set to sizeof(struct sockaddr_in) first.
     */
    goto *labels2[num_conns];

b15:
    conn_fds[conn_indexes[max_threads- 1]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 1]],&slenghts[max_threads- 1]);

b14:
    conn_fds[conn_indexes[max_threads- 2]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 2]],&slenghts[max_threads- 2]);

b13:
    conn_fds[conn_indexes[max_threads- 3]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 3]],&slenghts[max_threads- 3]);

b12:
    conn_fds[conn_indexes[max_threads- 4]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 4]],&slenghts[max_threads- 4]);

b11:
    conn_fds[conn_indexes[max_threads- 5]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 5]],&slenghts[max_threads- 5]);

b10:
    conn_fds[conn_indexes[max_threads- 6]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 6]],&slenghts[max_threads- 6]);

b9:
    conn_fds[conn_indexes[max_threads- 7]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 7]],&slenghts[max_threads- 7]);

b8:
    conn_fds[conn_indexes[max_threads- 8]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 8]],&slenghts[max_threads- 8]);

b7:
    conn_fds[conn_indexes[max_threads- 9]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 9]],&slenghts[max_threads- 9]);

b6:
    conn_fds[conn_indexes[max_threads-10]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-10]],&slenghts[max_threads-10]);

b5:
    conn_fds[conn_indexes[max_threads-11]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-11]],&slenghts[max_threads-11]);

b4:
    conn_fds[conn_indexes[max_threads-12]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-12]],&slenghts[max_threads-12]);

b3:
    conn_fds[conn_indexes[max_threads-13]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-13]],&slenghts[max_threads-13]);

b2:
    conn_fds[conn_indexes[max_threads-14]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-14]],&slenghts[max_threads-14]);

b1:
    conn_fds[conn_indexes[max_threads-15]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-15]],&slenghts[max_threads-15]);

b0:
    conn_fds[conn_indexes[max_threads-16]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads-16]],&slenghts[max_threads-16]);

    /* Phase 3: mark every slot handled above as conn_status_reading. */
    goto *labels3[num_conns];

c15:
    conn_statuses[conn_indexes[max_threads- 1]]=conn_status_reading;

c14:
    conn_statuses[conn_indexes[max_threads- 2]]=conn_status_reading;

c13:
    conn_statuses[conn_indexes[max_threads- 3]]=conn_status_reading;

c12:
    conn_statuses[conn_indexes[max_threads- 4]]=conn_status_reading;

c11:
    conn_statuses[conn_indexes[max_threads- 5]]=conn_status_reading;

c10:
    conn_statuses[conn_indexes[max_threads- 6]]=conn_status_reading;

c9:
    conn_statuses[conn_indexes[max_threads- 7]]=conn_status_reading;

c8:
    conn_statuses[conn_indexes[max_threads- 8]]=conn_status_reading;

c7:
    conn_statuses[conn_indexes[max_threads- 9]]=conn_status_reading;

c6:
    conn_statuses[conn_indexes[max_threads-10]]=conn_status_reading;

c5:
    conn_statuses[conn_indexes[max_threads-11]]=conn_status_reading;

c4:
    conn_statuses[conn_indexes[max_threads-12]]=conn_status_reading;

c3:
    conn_statuses[conn_indexes[max_threads-13]]=conn_status_reading;

c2:
    conn_statuses[conn_indexes[max_threads-14]]=conn_status_reading;

c1:
    conn_statuses[conn_indexes[max_threads-15]]=conn_status_reading;

c0:
    conn_statuses[conn_indexes[max_threads-16]]=conn_status_reading;

}

What I am doing here is processing max_threads number of connections in parallel. The loops are unrolled on purpose, for speed. The code is long, so I need to reduce it using macros, to something like this:

/*
 * Sketch of the desired shorthand -- NOT valid C. `loop(first,last) { ... }`
 * and `{iterator}` are a made-up notation standing for "repeat this body
 * once per value of the iterator, substituting the value textually".
 */
extern int conn_fds[max_clients];
extern int fl_conn_indexes[max_clients];
extern int fl_req_bufs[max_clients];
extern struct epoll_event estab_events[max_threads];
extern req_buf_t req_bufs[max_req_bufs];
extern int fl_req_bufs_top;
extern int conn_statuses[max_clients];
extern int fl_conn_indexes_top;
extern  tcpl_nc_t nc_http_list;
extern struct sockaddr_in conn_addresses[max_clients];

void accept_connections(unsigned int num_conns) {
    int fds[max_threads];
    int conn_indexes[max_threads];
    int conn_idx=0;
    int new_bottom;
    socklen_t slenghts[max_threads];
    /* Jump tables for the three unrolled phases (GCC labels-as-values). */
    void *labels1[max_threads] = {&&a0,&&a1,&&a2,&&a3,&&a4,&&a5,&&a6,&&a7,&&a8,&&a9,&&a10,&&a11,&&a12,&&a13,&&a14,&&a15};
    void *labels2[max_threads] = {&&b0,&&b1,&&b2,&&b3,&&b4,&&b5,&&b6,&&b7,&&b8,&&b9,&&b10,&&b11,&&b12,&&b13,&&b14,&&b15};
    void *labels3[max_threads] = {&&c0,&&c1,&&c2,&&c3,&&c4,&&c5,&&c6,&&c7,&&c8,&&c9,&&c10,&&c11,&&c12,&&c13,&&c14,&&c15};

    new_bottom=fl_conn_indexes_top-num_conns;
    if (new_bottom<=0) return;

    /*
     * Phase 1: copy entries of fl_conn_indexes[] into conn_indexes[].
     * NOTE(review): as written the iterator would name labels a1..a<max_threads>,
     * while the tables above list a0..a15 -- the sketch leaves that mapping open.
     */
    goto *labels1[num_conns];
loop(1,max_threads) {

a{iterator}:
    conn_indexes[max_threads- {iterator}]=fl_conn_indexes[fl_conn_indexes_top- {iterator}];
}
    fl_conn_indexes_top=fl_conn_indexes_top-num_conns;
    /*
     * Phase 2: accept() one connection per copied index.
     * NOTE(review): the conn_addresses index below reads `1{iterator}` where the
     * other two indexes on the same line read `{iterator}` -- looks like a stray `1`.
     */
    goto *labels2[num_conns];
loop(1,max_threads) {

b{iterator}:
    conn_fds[conn_indexes[max_threads- {iterator}]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[max_threads- 1{iterator}]],&slenghts[max_threads- {iterator}]);
}
    /* Phase 3: mark each handled slot as conn_status_reading. */
    goto *labels3[num_conns];
loop(1,max_threads) {

c{iterator}:
    conn_statuses[conn_indexes[max_threads- {iterator}]]=conn_status_reading;
}

}

Here {iterator} goes from 1 to max_threads inside loop(). The labels are needed because the number of connections can be lower than max_threads, so I have to skip the empty variables; otherwise it segfaults.

Can this be implemented easily with the preprocessor of GCC? If not, maybe I could use m4, but how do I integrate m4 with GCC for comfortable development? I am going to edit these macros along with many other files within the source code, so ease of coding is needed.

I will appreciate any help.

If you require an answer in m4:

dnl usage: unroll(var, initial, max, code)
dnl e.g. unroll(`i', 0, 10, ``total += arr[i]'')
dnl The example above goes from 0 to 9; it does not include the final 10.
dnl
dnl How it works: var ($1) is defined as the current value ($2). While
dnl $2 < $3 the code ($4) is emitted once and unroll calls itself with
dnl incr($2); when the test fails, var is undefined again and the
dnl recursion stops.
define(`unroll',
       `define(`$1', `$2')ifelse(eval($2 < $3),
                                 1,
                                 `$4`'unroll(`$1',incr($2),$3,`$4')',
                                 `undefine(`$1')')')dnl

void accept_connections(unsigned int num_conns) {
    int fds[max_threads];
    int conn_indexes[max_threads];
    int conn_idx=0;
    int new_bottom;
    socklen_t slenghts[max_threads];
dnl Each table below expands to sixteen entries, &&a0, through &&a15,
dnl (and likewise for b and c), including a trailing comma.
    void *labels1[max_threads] = {unroll(`i', 0, 16, `&&a`'i,')};
    void *labels2[max_threads] = {unroll(`i', 0, 16, `&&b`'i,')};
    void *labels3[max_threads] = {unroll(`i', 0, 16, `&&c`'i,')};

    new_bottom=fl_conn_indexes_top-num_conns;
    if (new_bottom<=0) return;

dnl max_num_conns has the same value (16) as the literal used for the
dnl label tables above.
    define(`max_num_conns', 16)
    goto *labels1[num_conns];
dnl In the three loops below, count runs from 1 to max_num_conns, and
dnl eval(max_num_conns() - count) produces the label numbers 15 down to 0.
unroll(`count', 1, incr(max_num_conns()), `dnl

a`'eval(max_num_conns() - count):
    conn_indexes[max_threads - count]=fl_conn_indexes[fl_conn_indexes_top - count];
')dnl

    fl_conn_indexes_top=fl_conn_indexes_top-num_conns;
    goto *labels2[num_conns];
dnl index is redefined on every iteration as shorthand for the slot
dnl expression, and removed again after the loop.
unroll(`count', 1, incr(max_num_conns()), `dnl
define(`index', `max_threads - count')dnl

b`'eval(max_num_conns() - count):
    conn_fds[conn_indexes[index]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[index]],&slenghts[index]);
')dnl
undefine(`index')

    goto *labels3[num_conns];
unroll(`count', 1, incr(max_num_conns()), `dnl

c`'eval(max_num_conns() - count):
    conn_statuses[conn_indexes[max_threads - count]]=conn_status_reading;
')dnl

}

Pay special attention to the macros within the code: define, unroll, eval, max_num_conns, count, index, and i. Also notice the quirky quoting that m4 likes.

I can also suggest an improvement (regardless of whether or not you use m4). The use case here, a series of goto targets following directly after each other, is covered by a more well-known control structure: switch (because fallthrough is the default behaviour without break):

    /* Three fall-through switch statements replace the three computed-goto phases. */
dnl Relies on the unroll macro from the previous listing. Each switch
dnl gets case labels 15 down to 0, with no break and no default, so
dnl control enters at case num_conns and falls through to case 0.
    define(`max_num_conns', 16)
    switch (num_conns) {
unroll(`count', 1, incr(max_num_conns()), `dnl
    case eval(max_num_conns() - count):
        conn_indexes[max_threads - count]=fl_conn_indexes[fl_conn_indexes_top - count];
')dnl
    }

    fl_conn_indexes_top=fl_conn_indexes_top-num_conns;
    switch (num_conns) {
unroll(`count', 1, incr(max_num_conns()), `dnl
define(`index', `max_threads - count')dnl
    case eval(max_num_conns() - count):
        conn_fds[conn_indexes[index]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[index]],&slenghts[index]);
')dnl
    }
undefine(`index')

    switch (num_conns) {
unroll(`count', 1, incr(max_num_conns()), `dnl
    case eval(max_num_conns() - count):
        conn_statuses[conn_indexes[max_threads - count]]=conn_status_reading;
')dnl
    }

If you do this conversion to switch, you can remove the label arrays.

Finally, perhaps use just one switch:

    /* One fall-through switch: every case does all three steps for its slot. */
dnl Relies on the unroll macro from the earlier listing. Unlike the
dnl three-phase versions, the copy, the accept() call and the status
dnl update for one slot are done together before moving on to the next
dnl case, and fl_conn_indexes_top is only lowered after the switch.
    define(`max_num_conns', 16)
    switch (num_conns) {
unroll(`count', 1, incr(max_num_conns()), `dnl
define(`index', `max_threads - count')dnl
    case eval(max_num_conns() - count):
        conn_indexes[index]=fl_conn_indexes[fl_conn_indexes_top - count];
        conn_fds[conn_indexes[index]]=accept(nc_http_list.sock_fd,(struct sockaddr*) &conn_addresses[conn_indexes[index]],&slenghts[index]);
        conn_statuses[conn_indexes[index]]=conn_status_reading;
')dnl
    }
undefine(`index')
    fl_conn_indexes_top=fl_conn_indexes_top-num_conns;

Note that I've moved the fl_conn_indexes_top modification to after the whole unrolled loop has finished - I'm not sure if that's a valid thing to do, because it depends on when the variable is read or written to. You can make the judgement call there.


As mentioned in a comment, this can be done with compiler flags too. GCC supports the -funroll-loops option, which does the unrolling at the compiler level. GCC also supports -fprofile-use, which (when fed a profile produced with -fprofile-generate) will unroll the loops that are shown to cause performance issues.


finally:

I think this is premature optimization. I doubt that the loop's comparison check is the bottleneck when each iteration calls accept!


Comments

Popular posts from this blog

c++ - llvm function pass ReplaceInstWithInst malloc -

Cross-Compiling Linux Kernel for Raspberry Pi - ${CCPREFIX}gcc -v does not work -

java.lang.NoClassDefFoundError When Creating New Android Project -