Analyzing the Implementation of the select System Call

Understanding the select System Call

The select system call provides synchronous I/O multiplexing, allowing a program to monitor multiple file descriptors for readiness. This analysis explores its internal implementation in the Linux kernel.

User Interface

The select API is defined as follows:

#include <sys/select.h>

/*
 * Monitor up to three sets of file descriptors for readiness.
 *
 *   maxfd      - highest descriptor number contained in any set, plus one
 *   read_set   - descriptors monitored for read readiness
 *   write_set  - descriptors monitored for write readiness
 *   except_set - descriptors monitored for exceptional conditions
 *   timeout    - maximum time to wait, expressed with microsecond precision
 */
int select(int maxfd, fd_set *read_set, fd_set *write_set,
           fd_set *except_set, struct timeval *timeout);

/* Helpers for building and inspecting an fd_set (see select(2)). */
void FD_CLR(int fd, fd_set *set);    /* remove fd from set */
int  FD_ISSET(int fd, fd_set *set);  /* non-zero if fd is a member of set */
void FD_SET(int fd, fd_set *set);    /* add fd to set */
void FD_ZERO(fd_set *set);           /* empty the set */

Key parameters:

  • maxfd: Highest file descriptor number plus one
  • read_set, write_set, except_set: Sets for read, write, and exception monitoring
  • timeout: Maximum wait time with microsecond precision

Implementation Overview

The select implementation relies on file operations through the f_op->poll method. The process involves:

  1. Iterating through file descriptors and calling f_op->poll()
  2. Setting up wakeup callbacks via __pollwait()
  3. Adding entries to file wait queues
  4. Checking readiness masks and setting appropriate flags

Core Data Structures

/*
 * Handle passed to a file's f_op->poll() method (see do_select(), which
 * passes &table.pt as the second argument).
 */
typedef struct poll_table_struct {
    poll_queue_proc _qproc;     // Callback function; do_select() sets it to NULL once an event has been found
    __poll_t _key;              // Event mask (presumably the events the caller is interested in -- confirm)
} poll_table;

/*
 * Per-call polling state. do_select() keeps one instance on its stack,
 * prepares it with poll_initwait() and releases it with poll_freewait().
 * Field notes marked "presumably" are inferred from names only and should
 * be confirmed against the kernel source.
 */
struct poll_wqueues {
    poll_table pt;                          // poll_table handed to each f_op->poll() call
    struct poll_table_page *table;          // presumably extra entry storage once the inline array is full
    struct task_struct *polling_task;       // presumably the task that is waiting and must be woken
    int triggered;                          // presumably set when a wakeup has fired
    int error;                              // presumably an error recorded while queueing waiters
    int inline_index;                       // presumably the next unused slot in inline_entries
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];  // embedded entry storage
};

/*
 * One wait-queue registration made on behalf of the polling task.
 * Field notes marked "presumably" are inferred from names only.
 */
struct poll_table_entry {
    struct file *filp;                  // file this entry belongs to
    __poll_t key;                       // event mask associated with the entry
    wait_queue_entry_t wait;            // the wait-queue entry itself
    wait_queue_head_t *wait_address;    // presumably the wait queue head the entry was added to
};

Implementation Details

Timeout Handling

The system call converts microsecond precision to nanosecond precision for internal timing:

/*
 * Front end of the select system call: copy in the optional user-space
 * timeval, let poll_select_set_timeout() fill in the timespec64 end time,
 * and delegate the real work to core_sys_select().
 *
 * Returns -EFAULT when the timeval cannot be copied from user space,
 * -EINVAL when poll_select_set_timeout() rejects it, and otherwise the
 * result of core_sys_select().
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, struct timeval __user *tvp)
{
    struct timespec64 deadline;
    struct timeval utv;

    // No timeval supplied: core_sys_select() receives a NULL end time.
    if (!tvp)
        return core_sys_select(n, inp, outp, exp, NULL);

    if (copy_from_user(&utv, tvp, sizeof(utv)) != 0)
        return -EFAULT;

    // Whole seconds carried in tv_usec are folded into the seconds
    // argument; the remaining microseconds are scaled to nanoseconds.
    if (poll_select_set_timeout(&deadline,
            utv.tv_sec + (utv.tv_usec / USEC_PER_SEC),
            (utv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0)
        return -EINVAL;

    return core_sys_select(n, inp, outp, exp, &deadline);
}

File Descriptor Processing

The core processing involves:

/*
 * Copy the three user-space descriptor sets into kernel memory, run
 * do_select() on them, and copy the three result sets back.
 *
 *   n            - number of descriptors to examine (highest fd + 1)
 *   inp/outp/exp - user-space read / write / exception sets
 *   end_time     - end time for the wait, or NULL for no timeout
 *
 * Returns the number of ready descriptors (0 on timeout) or a negative
 * errno: -EINVAL for a negative n, -ENOMEM when the bitmaps cannot be
 * allocated, -EFAULT when a result set cannot be written back,
 * -ERESTARTNOHAND when the wait was cut short by a signal, or whatever
 * error get_fd_set() / do_select() reported.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                    fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    size_t size, alloc_size;
    struct fdtable *fdt;
    // Small requests are served from the stack to avoid an allocation
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    // Validate and adjust n: read max_fds once under RCU, then clamp
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    if (n > max_fds)
        n = max_fds;

    // Allocate memory for descriptor sets: six bitmaps of `size` bytes
    // each (in/out/ex for the request plus in/out/ex for the result)
    size = FDS_BYTES(n);
    bits = stack_fds;
    if (size > sizeof(stack_fds) / 6) {
        // Does not fit in the on-stack array; fall back to the heap
        ret = -ENOMEM;
        if (size > (SIZE_MAX / 6))      // 6 * size would overflow
            goto out_nofds;

        alloc_size = 6 * size;
        bits = kvmalloc(alloc_size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }

    // Set up descriptor set pointers
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;

    // Copy user sets to kernel space
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;

    // Clear result sets
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    // On failure the user's sets are left untouched
    if (ret < 0)
        goto out;
    if (!ret) {
        // Nothing ready: tell a pending signal apart from a plain timeout
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    // Copy results back to user space
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    // Only the heap fallback needs to be released
    if (bits != stack_fds)
        kvfree(bits);
out_nofds:
    return ret;
}

Polling Mechanism

The actual polling occurs in do_select():

/*
 * Poll every descriptor named in fds->in/out/ex and record the ready ones
 * in fds->res_in/res_out/res_ex, sleeping between passes until something
 * is ready, the end time passes, or a signal is pending.
 *
 *   n        - number of descriptors to examine
 *   fds      - kernel copies of the request bitmaps plus the result bitmaps
 *   end_time - end time for the wait, or NULL to wait without a timeout
 *
 * Returns the number of ready (descriptor, event) pairs, 0 when nothing
 * became ready, or a negative errno.
 *
 * NOTE(review): this walkthrough version leaves out the network busy-poll
 * path, so no busy-loop flag is ever requested from wait_key_set().
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    ktime_t expire, *to = NULL;
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i, timed_out = 0;
    u64 slack = 0;
    __poll_t busy_flag = 0;

    // Reject sets that name descriptors which are not open and shrink n
    // to the highest descriptor actually requested, plus one
    rcu_read_lock();
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;

    // A zero timeout means "poll once": never queue on any wait queue
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        // Walk the bitmaps one long at a time; i is the descriptor number
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            __poll_t mask;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                // No descriptor requested in this word: skip it entirely
                i += BITS_PER_LONG;
                continue;
            }

            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                struct fd f;
                if (i >= n) break;
                if (!(bit & all_bits)) continue;

                f = fdget(i);
                if (f.file) {
                    const struct file_operations *f_op = f.file->f_op;
                    mask = DEFAULT_POLLMASK;

                    if (f_op->poll) {
                        wait_key_set(wait, in, out, bit, busy_flag);
                        mask = (*f_op->poll)(f.file, wait);
                    }

                    fdput(f);

                    // Once any event is found we will not sleep, so stop
                    // registering on further wait queues
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                }
            }

            if (res_in) *rinp = res_in;
            if (res_out) *routp = res_out;
            if (res_ex) *rexp = res_ex;
            cond_resched();
        }

        // Waiters were queued during the first pass; later passes must
        // only re-check readiness, not queue again
        wait->_qproc = NULL;
        if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }

        // First time we are about to sleep with a timeout: convert the
        // end time to the ktime_t that poll_schedule_timeout() expects
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }

        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }

    poll_freewait(&table);
    return retval;
}

Technical Limitations

The select interface can only monitor file descriptors numbered below 1024, because fd_set is a fixed-size bitmap with one bit per descriptor:

// Number of descriptors an fd_set can describe (one bit each).
#define __FD_SETSIZE 1024
// One word of the descriptor bitmap.
typedef long int __fd_mask;

// Fixed-size descriptor bitmap. __NFDBITS is defined elsewhere; presumably
// it is the number of bits in one __fd_mask, so the array holds
// __FD_SETSIZE bits in total -- confirm against the C library headers.
typedef struct {
    __fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];
} fd_set;

The first parameter must be the highest file descriptor plus one because select uses this value both as the upper bound of its scan and to size the bitmaps it allocates (FDS_BYTES(n)): polling starts at file descriptor 0 and visits every descriptor below that value.

Tags: SELECT system-calls io-multiplexing linux-kernel file-descriptors

Posted on Thu, 14 May 2026 13:42:45 +0000 by immot