diff --git a/include/proto/fd.h b/include/proto/fd.h index 240a2e275..f0265b795 100644 --- a/include/proto/fd.h +++ b/include/proto/fd.h @@ -30,6 +30,12 @@ #include #include +/* public variables */ +extern int fd_nbspec; // number of speculative events in the list +extern int fd_nbupdt; // number of updates in the list +extern unsigned int *fd_spec; // speculative I/O list +extern unsigned int *fd_updt; // FD updates list + /* Deletes an FD from the fdsets, and recomputes the maxfd limit. * The file descriptor is also closed. */ @@ -70,7 +76,49 @@ int list_pollers(FILE *out); */ void run_poller(); -#define EV_FD_ISSET(fd, ev) (cur_poller.is_set((fd), (ev))) +/* Mark fd as updated and allocate an entry in the update list for this if + * it was not already there. This can be done at any time. + */ +static inline void updt_fd(const int fd) +{ + if (fdtab[fd].updated) + /* already scheduled for update */ + return; + fd_updt[fd_nbupdt++] = fd; + fdtab[fd].updated = 1; +} + + +/* allocate an entry for a speculative event. This can be done at any time. */ +static inline void alloc_spec_entry(const int fd) +{ + if (fdtab[fd].spec_p) + /* FD already in speculative I/O list */ + return; + fd_spec[fd_nbspec++] = fd; + fdtab[fd].spec_p = fd_nbspec; +} + +/* Removes entry used by fd from the spec list and replaces it with the + * last one. The fdtab.spec is adjusted to match the back reference if needed. + * If the fd has no entry assigned, return immediately. 
+ */ +static inline void release_spec_entry(int fd) +{ + unsigned int pos; + + pos = fdtab[fd].spec_p; + if (!pos) + return; + fdtab[fd].spec_p = 0; + fd_nbspec--; + if (pos <= fd_nbspec) { + /* was not the last entry */ + fd = fd_spec[fd_nbspec]; + fd_spec[pos - 1] = fd; + fdtab[fd].spec_p = pos; + } +} /* event manipulation primitives for use by I/O callbacks */ static inline void fd_want_recv(int fd) diff --git a/include/types/fd.h b/include/types/fd.h index a99c6bfa5..fab0c6c64 100644 --- a/include/types/fd.h +++ b/include/types/fd.h @@ -25,13 +25,13 @@ #include #include +/* Direction for each FD event update */ enum { DIR_RD=0, DIR_WR=1, - DIR_SIZE }; -/* +/* Polling status flags returned in fdtab[].ev : * FD_POLL_IN remains set as long as some data is pending for read. * FD_POLL_OUT remains set as long as the fd accepts to write data. * FD_POLL_ERR and FD_POLL_ERR remain set forever (until processed). @@ -45,6 +45,26 @@ enum { #define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT) #define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP) +/* Event state for an FD in each direction, as found in the 4 lower bits of + * fdtab[].spec_e, and in the 4 next bits. 
+ */ +#define FD_EV_ACTIVE 1U +#define FD_EV_POLLED 4U +#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED) +#define FD_EV_STATUS_R (FD_EV_STATUS) +#define FD_EV_STATUS_W (FD_EV_STATUS << 1) + +#define FD_EV_POLLED_R (FD_EV_POLLED) +#define FD_EV_POLLED_W (FD_EV_POLLED << 1) +#define FD_EV_POLLED_RW (FD_EV_POLLED_R | FD_EV_POLLED_W) + +#define FD_EV_ACTIVE_R (FD_EV_ACTIVE) +#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 1) +#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W) + +#define FD_EV_CURR_MASK 0x0FU +#define FD_EV_PREV_MASK 0xF0U + /* info about one given fd */ struct fdtab { int (*iocb)(int fd); /* I/O handler, returns FD_WAIT_* */ diff --git a/src/ev_sepoll.c b/src/ev_sepoll.c index 35ce34c75..ff185d3bd 100644 --- a/src/ev_sepoll.c +++ b/src/ev_sepoll.c @@ -7,85 +7,6 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - * - * - * This code implements "speculative I/O" under Linux. The principle is to - * try to perform expected I/O before registering the events in the poller. - * Each time this succeeds, it saves an expensive epoll_ctl(). It generally - * succeeds for all reads after an accept(), and for writes after a connect(). - * It also improves performance for streaming connections because even if only - * one side is polled, the other one may react accordingly depending on the - * level of the buffer. - * - * More importantly, it enables I/O operations that are backed by invisible - * buffers. For example, SSL is able to read a whole socket buffer and not - * deliver it to the application buffer because it's full. Unfortunately, it - * won't be reported by epoll() anymore until some new activity happens. The - * only way to call it again thus is to perform speculative I/O as soon as - * reading on the FD is enabled again. - * - * The speculative I/O relies on a double list of expected events and updates. 
- * Expected events are events that are expected to come and that we must report - * to the application until it asks to stop or to poll. Updates are new requests - * for changing an FD state. Updates are the only way to create new events. This - * is important because it means that the number of speculative events cannot - * increase between updates and will only grow one at a time while processing - * updates. All updates must always be processed, though events might be - * processed by small batches if required. The result is that there is no need - * for preallocating room for spec events, updates evinced from the list always - * release at least as much as necessary. - * - * In order to limit memory usage, events and updates share the same list (an - * array to be exact). The lower part (0..nbevts) is used by events and the - * higher part by updates. This way, an fd may be mapped to any entry (evt or - * update) using a single index. Updates may be simply turned to events. When - * events are deleted, the last event from the list must replace the deleted - * event, and if there were updates past this event, one must be moved to take - * its place. It still means that any file descriptor might be present in the - * event or update list, so the list must be at least as large as the maximum - * number of simultaneous file descriptors. - * - * It is important to understand that as long as all expected events are - * processed, they might starve the polled events, especially because polled - * I/O starvation quickly induces more speculative I/O. One solution to this - * consists in only processing a part of the events at once, but one drawback - * is that unhandled events will still wake epoll_wait() up. Using EPOLL_ET - * will solve this issue though. - * - * A file descriptor has a distinct state for each direction. 
This state is a - * combination of two bits : - * bit 0 = active Y/N : is set if the FD is active, which means that its - * handler will be called without prior polling ; - * bit 1 = polled Y/N : is set if the FD was subscribed to polling - * - * It is perfectly valid to have both bits set at a time, which generally means - * that the FD was reported by polling, was marked active and not yet unpolled. - * Such a state must not last long to avoid unneeded wakeups. - * - * The state of the FD as of last change is preserved in two other bits. These - * ones are useful to save a significant amount of system calls during state - * changes, because there is no need to call epoll_ctl() until we're about to - * call epoll_wait(). - * - * Since we do not want to scan all the FD list to find speculative I/O events, - * we store them in a list consisting in a linear array holding only the FD - * indexes right now. Note that a closed FD cannot exist in the spec list, - * because it is closed by fd_delete() which in turn calls __fd_clo() which - * always removes it from the list. - * - * For efficiency reasons, we will store the Read and Write bits interlaced to - * form a 4-bit field, so that we can simply shift the value right by 0/1 and - * get what we want : - * 3 2 1 0 - * Wp Rp Wa Ra - * - * The FD array has to hold a back reference to the speculative list. This - * reference is always valid unless the FD if currently being polled and not - * updated (in which case the reference points to index 0). - * - * We store the FD state in the 4 lower bits of fdtab[fd].spec_e, and save the - * previous state upon changes in the 4 higher bits, so that changes are easy - * to spot. 
*/ #include @@ -108,36 +29,9 @@ #include -#define FD_EV_ACTIVE 1U -#define FD_EV_POLLED 4U -#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED) -#define FD_EV_STATUS_R (FD_EV_STATUS) -#define FD_EV_STATUS_W (FD_EV_STATUS << 1) - -#define FD_EV_POLLED_R (FD_EV_POLLED) -#define FD_EV_POLLED_W (FD_EV_POLLED << 1) -#define FD_EV_POLLED_RW (FD_EV_POLLED_R | FD_EV_POLLED_W) - -#define FD_EV_ACTIVE_R (FD_EV_ACTIVE) -#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 1) -#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W) - -#define FD_EV_CURR_MASK 0x0FU -#define FD_EV_PREV_MASK 0xF0U - -/* This is the minimum number of events successfully processed in speculative - * mode above which we agree to return without checking epoll() (1/2 times). - */ -#define MIN_RETURN_EVENTS 25 - -static int nbspec = 0; // number of speculative events in the list -static int nbupdt = 0; // number of updates in the list static int absmaxevents = 0; // absolute maximum amounts of polled events static int in_poll_loop = 0; // non-null if polled events are being processed -static unsigned int *spec_list = NULL; // speculative I/O list -static unsigned int *updt_list = NULL; // FD updates list - /* private data */ static struct epoll_event *epoll_events; static int epoll_fd; @@ -147,51 +41,6 @@ static int epoll_fd; */ static struct epoll_event ev; - -/* Mark fd as updated and allocate an entry in the update list for this if - * it was not already there. This can be done at any time. - */ -REGPRM1 static inline void updt_fd(const int fd) -{ - if (fdtab[fd].updated) - /* already scheduled for update */ - return; - updt_list[nbupdt++] = fd; - fdtab[fd].updated = 1; -} - - -/* allocate an entry for a speculative event. This can be done at any time. 
*/ -REGPRM1 static inline void alloc_spec_entry(const int fd) -{ - if (fdtab[fd].spec_p) - /* FD already in speculative I/O list */ - return; - spec_list[nbspec++] = fd; - fdtab[fd].spec_p = nbspec; -} - -/* Removes entry used by fd from the spec list and replaces it with the - * last one. The fdtab.spec is adjusted to match the back reference if needed. - * If the fd has no entry assigned, return immediately. - */ -REGPRM1 static void release_spec_entry(int fd) -{ - unsigned int pos; - - pos = fdtab[fd].spec_p; - if (!pos) - return; - fdtab[fd].spec_p = 0; - nbspec--; - if (pos <= nbspec) { - /* was not the last entry */ - fd = spec_list[nbspec]; - spec_list[pos - 1] = fd; - fdtab[fd].spec_p = pos; - } -} - /* * Returns non-zero if is already monitored for events in direction . */ @@ -298,8 +147,8 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) int wait_time; /* first, scan the update list to find changes */ - for (updt_idx = 0; updt_idx < nbupdt; updt_idx++) { - fd = updt_list[updt_idx]; + for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { + fd = fd_updt[updt_idx]; en = fdtab[fd].spec_e & 15; /* new events */ eo = fdtab[fd].spec_e >> 4; /* previous events */ @@ -348,11 +197,11 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) fdtab[fd].updated = 0; fdtab[fd].new = 0; } - nbupdt = 0; + fd_nbupdt = 0; /* compute the epoll_wait() timeout */ - if (nbspec || run_queue || signal_queue_len) { + if (fd_nbspec || run_queue || signal_queue_len) { /* Maybe we still have events in the spec list, or there are * some tasks left pending in the run_queue, so we must not * wait in epoll() otherwise we would delay their delivery by @@ -403,7 +252,7 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) ((e & EPOLLHUP) ? FD_POLL_HUP : 0); if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev) { - int new_updt, old_updt = nbupdt; /* Save number of updates to detect creation of new FDs. 
*/ + int new_updt, old_updt = fd_nbupdt; /* Save number of updates to detect creation of new FDs. */ /* Mark the events as speculative before processing * them so that if nothing can be done we don't need @@ -426,8 +275,8 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) * scan the new entries backwards. */ - for (new_updt = nbupdt; new_updt > old_updt; new_updt--) { - fd = updt_list[new_updt - 1]; + for (new_updt = fd_nbupdt; new_updt > old_updt; new_updt--) { + fd = fd_updt[new_updt - 1]; if (!fdtab[fd].new) continue; @@ -446,9 +295,9 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* we can remove this update entry if it's the last one and is * unused, otherwise we don't touch anything. */ - if (new_updt == nbupdt && fdtab[fd].spec_e == 0) { + if (new_updt == fd_nbupdt && fdtab[fd].spec_e == 0) { fdtab[fd].updated = 0; - nbupdt--; + fd_nbupdt--; } } } @@ -456,8 +305,8 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* now process speculative events if any */ - for (spec_idx = 0; spec_idx < nbspec; ) { - fd = spec_list[spec_idx]; + for (spec_idx = 0; spec_idx < fd_nbspec; ) { + fd = fd_spec[spec_idx]; eo = fdtab[fd].spec_e; /* @@ -483,7 +332,7 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* if the fd was removed from the spec list, it has been * replaced by the next one that we don't want to skip ! 
*/ - if (spec_idx < nbspec && spec_list[spec_idx] != fd) + if (spec_idx < fd_nbspec && fd_spec[spec_idx] != fd) continue; spec_idx++; @@ -500,8 +349,6 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) */ REGPRM1 static int _do_init(struct poller *p) { - __label__ fail_spec, fail_ee, fail_fd; - p->private = NULL; epoll_fd = epoll_create(global.maxsock + 1); @@ -516,18 +363,8 @@ REGPRM1 static int _do_init(struct poller *p) if (epoll_events == NULL) goto fail_ee; - if ((spec_list = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) - goto fail_spec; - - if ((updt_list = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) - goto fail_updt; - return 1; - fail_updt: - free(spec_list); - fail_spec: - free(epoll_events); fail_ee: close(epoll_fd); epoll_fd = -1; @@ -542,8 +379,6 @@ REGPRM1 static int _do_init(struct poller *p) */ REGPRM1 static void _do_term(struct poller *p) { - free(updt_list); - free(spec_list); free(epoll_events); if (epoll_fd >= 0) { @@ -551,10 +386,7 @@ REGPRM1 static void _do_term(struct poller *p) epoll_fd = -1; } - updt_list = NULL; - spec_list = NULL; epoll_events = NULL; - p->private = NULL; p->pref = 0; } diff --git a/src/fd.c b/src/fd.c index 9a73d35c2..5d63cc371 100644 --- a/src/fd.c +++ b/src/fd.c @@ -1,13 +1,85 @@ /* * File descriptors management functions. * - * Copyright 2000-2008 Willy Tarreau + * Copyright 2000-2012 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * This code implements "speculative I/O". The principle is to try to perform + * expected I/O before registering the events in the poller. Each time this + * succeeds, it saves a possibly expensive system call to set the event. It + * generally succeeds for all reads after an accept(), and for writes after a + * connect(). 
It also improves performance for streaming connections because + * even if only one side is polled, the other one may react accordingly + * depending on the fill level of the buffer. This behaviour is also the only + * one compatible with event-based pollers (eg: EPOLL_ET). + * + * More importantly, it enables I/O operations that are backed by invisible + * buffers. For example, SSL is able to read a whole socket buffer and not + * deliver it to the application buffer because it's full. Unfortunately, it + * won't be reported by a poller anymore until some new activity happens. The + * only way to call it again thus is to perform speculative I/O as soon as + * reading on the FD is enabled again. + * + * The speculative I/O uses a list of expected events and a list of updates. + * Expected events are events that are expected to come and that we must report + * to the application until it asks to stop or to poll. Updates are new requests + * for changing an FD state. Updates are the only way to create new events. This + * is important because it means that the number of speculative events cannot + * increase between updates and will only grow one at a time while processing + * updates. All updates must always be processed, though events might be + * processed by small batches if required. + * + * There is no direct link between the FD and the updates list. There is only a + * bit in the fdtab[] to indicate that a file descriptor is already present in + * the updates list. Once an fd is present in the updates list, it will have to + * be considered even if its changes are reverted in the middle or if the fd is + * replaced. + * + * It is important to understand that as long as all expected events are + * processed, they might starve the polled events, especially because polled + * I/O starvation quickly induces more speculative I/O. 
One solution to this + * consists in only processing a part of the events at once, but one drawback + * is that unhandled events will still wake the poller up. Using an event-driven + * poller such as EPOLL_ET will solve this issue though. + * + * A file descriptor has a distinct state for each direction. This state is a + * combination of two bits : + * bit 0 = active Y/N : is set if the FD is active, which means that its + * handler will be called without prior polling ; + * bit 1 = polled Y/N : is set if the FD was subscribed to polling + * + * It is perfectly valid to have both bits set at a time, which generally means + * that the FD was reported by polling, was marked active and not yet unpolled. + * Such a state must not last long to avoid unneeded wakeups. + * + * The state of the FD as of last change is preserved in two other bits. These + * ones are useful to save a significant amount of system calls during state + * changes, because there is no need to update the FD status in the system until + * we're about to call the poller. + * + * Since we do not want to scan all the FD list to find speculative I/O events, + * we store them in a list consisting in a linear array holding only the FD + * indexes right now. Note that a closed FD cannot exist in the spec list, + * because it is closed by fd_delete() which in turn calls __fd_clo() which + * always removes it from the list. + * + * For efficiency reasons, we will store the Read and Write bits interlaced to + * form a 4-bit field, so that we can simply shift the value right by 0/1 and + * get what we want : + * 3 2 1 0 + * Wp Rp Wa Ra + * + * The FD array has to hold a back reference to the speculative list. This + * reference is always valid unless the FD is currently being polled and not + * updated (in which case the reference points to index 0). 
+ * + * We store the FD state in the 4 lower bits of fdtab[fd].spec_e, and save the + * previous state upon changes in the 4 higher bits, so that changes are easy + * to spot. */ #include @@ -18,6 +90,8 @@ #include #include +#include + #include #include @@ -31,6 +105,11 @@ struct poller pollers[MAX_POLLERS]; struct poller cur_poller; int nbpollers = 0; +/* FD status is defined by the poller's status and by the speculative I/O list */ +int fd_nbspec = 0; // number of speculative events in the list +int fd_nbupdt = 0; // number of updates in the list +unsigned int *fd_spec = NULL; // speculative I/O list +unsigned int *fd_updt = NULL; // FD updates list /* Deletes an FD from the fdsets, and recomputes the maxfd limit. * The file descriptor is also closed. @@ -68,6 +147,11 @@ int init_pollers() int p; struct poller *bp; + if ((fd_spec = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) + goto fail_spec; + + if ((fd_updt = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) + goto fail_updt; do { bp = NULL; @@ -84,6 +168,11 @@ int init_pollers() } } while (!bp || bp->pref == 0); return 0; + + fail_updt: + free(fd_spec); + fail_spec: + return 0; } /* @@ -100,6 +189,11 @@ void deinit_pollers() { if (bp && bp->pref) bp->term(bp); } + + free(fd_updt); + free(fd_spec); + fd_updt = NULL; + fd_spec = NULL; } /*