/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"
#include <errno.h>
#include <sys/epoll.h>

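/* Create the epoll instance that backs this loop. Prefer epoll_create1() with
 * close-on-exec; fall back to epoll_create() plus an explicit close-on-exec
 * flag for kernels that predate epoll_create1() or its flag argument.
 */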
int uv__epoll_init(uv_loop_t* loop) {
  int fd;
  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}


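/* Forget about `fd`: mark any entries for it that are still pending in the
 * events array of the current epoll_wait() call as invalid, and drop it from
 * the epoll set. Typically called when the file descriptor is being closed.
 */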
void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}


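/* Check whether `fd` is of a kind that epoll can watch: attempt a temporary
 * EPOLL_CTL_ADD (EEXIST counts as pollable) and undo the registration with
 * EPOLL_CTL_DEL. Returns 0 on success, a negated errno value otherwise.
 */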
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


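/* Poll for I/O. First flush the watcher queue, registering or updating each
 * pending watcher's interest set with epoll_ctl(). Then wait for events with
 * epoll_wait()/epoll_pwait() using `timeout` (milliseconds, -1 means block
 * indefinitely) and dispatch the returned events to their watchers.
 */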
void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32-bit architectures.  To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ.  The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once).
   */
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }

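  /* uv_loop_configure(loop, UV_LOOP_BLOCK_SIGPROF) asks us to block SIGPROF
   * while we wait in epoll, which suppresses unnecessary wakeups when a
   * sampling profiler is running.
   */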
  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

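  /* When idle time metrics are enabled, make the first pass through the poll
   * loop non-blocking so that time spent handling already-pending events is
   * not counted as idle time; the caller's timeout is restored afterwards.
   */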
  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary.  Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

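    /* Stash the events array and its length in the watchers array so that
     * uv__platform_invalidate_fd() can invalidate entries for any file
     * descriptor that gets closed by a callback while we iterate below.
     */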
    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only the events they're interested in. This prevents
       * spurious callbacks when a previous callback invocation in this loop
       * has stopped the current watcher, and filters out events the user has
       * not asked us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again.  By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read.  If anything, libuv is to blame here.  The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event.  We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last.  This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

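    /* If we dispatched callbacks and the events array came back completely
     * full, there are probably more events pending; poll again without
     * blocking, but give up after `count` passes so we don't starve the rest
     * of the event loop.
     */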
    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

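    /* Subtract the time we already spent waiting from the caller's original
     * timeout; return if it has expired, otherwise poll again with the
     * remaining time.
     */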
update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}