Understanding the select System Call
The select system call provides synchronous I/O multiplexing, allowing a program to monitor multiple file descriptors for readiness. This analysis explores its internal implementation in the Linux kernel.
User Interface
The select API is defined as follows:
#include <sys/select.h>
int select(int maxfd, fd_set *read_set, fd_set *write_set,
fd_set *except_set, struct timeval *timeout);
void FD_CLR(int fd, fd_set *set);
int FD_ISSET(int fd, fd_set *set);
void FD_SET(int fd, fd_set *set);
void FD_ZERO(fd_set *set);
Key parameters:
- maxfd: the highest-numbered file descriptor, plus one
- read_set, write_set, except_set: descriptor sets monitored for read, write, and exceptional conditions
- timeout: maximum time to wait, with microsecond precision
Implementation Overview
The select implementation relies on file operations through the f_op->poll method. The process involves:
- Iterating through file descriptors and calling f_op->poll()
- Setting up wakeup callbacks via __pollwait()
- Adding entries to file wait queues
- Checking readiness masks and setting appropriate flags
Core Data Structures
typedef struct poll_table_struct {
poll_queue_proc _qproc; // Callback function
__poll_t _key; // Event mask
} poll_table;
struct poll_wqueues {
poll_table pt;
struct poll_table_page *table;
struct task_struct *polling_task;
int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
struct poll_table_entry {
struct file *filp;
__poll_t key;
wait_queue_entry_t wait;
wait_queue_head_t *wait_address;
};
Implementation Details
Timeout Handling
The system call converts microsecond precision to nanosecond precision for internal timing:
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, struct timeval __user *tvp)
{
struct timespec64 end_time, *to = NULL;
struct timeval tv;
if (tvp) {
if (copy_from_user(&tv, tvp, sizeof(tv)))
return -EFAULT;
to = &end_time;
if (poll_select_set_timeout(to,
tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
return -EINVAL;
}
return core_sys_select(n, inp, outp, exp, to);
}
File Descriptor Processing
The core processing involves:
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
// Validate and adjust n
rcu_read_lock();
fdt = files_fdtable(current->files);
max_fds = fdt->max_fds;
rcu_read_unlock();
if (n > max_fds) n = max_fds;
// Allocate memory for descriptor sets
size = FDS_BYTES(n);
bits = stack_fds;
if (size > sizeof(stack_fds) / 6) {
alloc_size = 6 * size;
bits = kvmalloc(alloc_size, GFP_KERNEL);
}
// Set up descriptor set pointers
fds.in = bits;
fds.out = bits + size;
fds.ex = bits + 2*size;
fds.res_in = bits + 3*size;
fds.res_out = bits + 4*size;
fds.res_ex = bits + 5*size;
// Copy user sets to kernel space
if ((ret = get_fd_set(n, inp, fds.in)) ||
(ret = get_fd_set(n, outp, fds.out)) ||
(ret = get_fd_set(n, exp, fds.ex)))
goto out;
// Clear result sets
zero_fd_set(n, fds.res_in);
zero_fd_set(n, fds.res_out);
zero_fd_set(n, fds.res_ex);
ret = do_select(n, &fds, end_time);
// Copy results back to user space
if (set_fd_set(n, inp, fds.res_in) ||
set_fd_set(n, outp, fds.res_out) ||
set_fd_set(n, exp, fds.res_ex))
ret = -EFAULT;
return ret;
}
Polling Mechanism
The actual polling occurs in do_select():
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
poll_initwait(&table);
wait = &table.pt;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
__poll_t mask;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
if (all_bits == 0) {
i += BITS_PER_LONG;
continue;
}
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
struct fd f;
if (i >= n) break;
if (!(bit & all_bits)) continue;
f = fdget(i);
if (f.file) {
const struct file_operations *f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op->poll) {
wait_key_set(wait, in, out, bit, busy_flag);
mask = (*f_op->poll)(f.file, wait);
}
fdput(f);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
wait->_qproc = NULL;
}
// Similar checks for POLLOUT_SET and POLLEX_SET
}
}
if (res_in) *rinp = res_in;
if (res_out) *routp = res_out;
if (res_ex) *rexp = res_ex;
cond_resched();
}
if (retval || timed_out || signal_pending(current))
break;
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
poll_freewait(&table);
return retval;
}
Technical Limitations
The select implementation has a fixed limit of 1024 file descriptors due to the fd_set structure definition:
#define __FD_SETSIZE 1024
typedef long int __fd_mask;
typedef struct {
__fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];
} fd_set;
The first parameter must be the highest file descriptor plus one because select uses this value both as the upper bound of its descriptor scan loop and to size the bitmaps it copies between user and kernel space, with polling always starting from file descriptor 0.