kernel

Wrong ‘struct timeval’ for setsockopt()

What if we deliberately fill in ‘struct timeval’ in this incorrect way to set the receive timeout to 3 seconds (1 second plus 2,000,000 microseconds):

struct timeval tv = {1, 2000000};
setsockopt(fd,SOL_SOCKET,SO_RCVTIMEO,&tv,sizeof(tv));

the ‘setsockopt’ call will fail (return -1).
Let’s look at the Linux kernel code for the system call sys_setsockopt():

/*
 * setsockopt(2) entry point: rejects a negative optlen, resolves fd to a
 * socket, runs the security hook and then dispatches on 'level'.
 * Returns -EINVAL for a negative optlen, otherwise 'err' as left by the
 * lookup, the security hook or the option handler.
 */
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
        char __user *, optval, int, optlen)
{
    int err, fput_needed;
    struct socket *sock;
    /* A negative option length is refused before the fd is looked up. */
    if (optlen < 0)
        return -EINVAL;
    /* NOTE(review): presumably fills in err when the lookup fails - confirm. */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock != NULL) {
        /* The security hook can veto the call; a non-zero err skips the handlers. */
        err = security_socket_setsockopt(sock, level, optname);
        if (err)
            goto out_put;
        /* SOL_SOCKET options go to the generic handler ... */
        if (level == SOL_SOCKET)
            err =
                sock_setsockopt(sock, level, optname, optval,
                        optlen);
        /* ... every other level goes to the socket's own ops->setsockopt. */
        else
            err =
                sock->ops->setsockopt(sock, level, optname, optval,
                          optlen);
out_put:
        /* Balances sockfd_lookup_light() on both the success and the veto path. */
        fput_light(sock->file, fput_needed);
    }
    return err;
}

sock_setsockopt() will invoke sock_set_timeout() and sock_set_timeout() looks like:

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
    struct timeval tv;
    if (optlen < sizeof(tv))
        return -EINVAL;
    if (copy_from_user(&tv, optval, sizeof(tv)))
        return -EFAULT;
    if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
        return -EDOM;
......

That’s it. If ‘tv.tv_usec’ is greater than or equal to USEC_PER_SEC (which equals 1000000), sock_set_timeout() returns -EDOM and setsockopt() fails.

Avoid “page allocation failure” for linux kernel in big memory server

After adding pressure to a Key-Value cluster, I found many errors in dmesg:

[551336.912108]  [] ? dequeue_task+0x8e/0xb0
[551336.912114]  [] ? ext4_get_block+0x0/0x120 [ext4]
[551336.912118]  [] ? __do_fault+0xd0/0x530
[551336.912122]  [] ? copy_user_generic+0xe/0x20
[551336.912124]  [] ? handle_pte_fault+0x9c/0xba0
[551336.912131]  [] ? rwsem_down_failed_common+0x95/0x1e0
[551336.912134]  [] ? rwsem_down_read_failed+0x26/0x30
[551336.912137]  [] ? handle_mm_fault+0x23a/0x310
[551336.912142]  [] ? call_rwsem_down_read_failed+0x14/0x30
[551336.912145]  [] ? __do_page_fault+0x139/0x480
[551336.912149]  [] ? finish_task_switch+0x4f/0xe0
[551336.912152]  [] ? do_page_fault+0x3e/0xb0
[551336.912156]  [] ? page_fault+0x25/0x30
[552116.858565] swapper: page allocation failure. order:1, mode:0x20
[552116.858569] Pid: 0, comm: swapper Tainted: G           --------------- H  #1
[552116.858571] Call Trace:
[552116.858573]    [] ? __alloc_pages_nodemask+0x76a/0x8f0
[552116.858588]  [] ? dev_hard_start_xmit+0x303/0x570
[552116.858593]  [] ? kmem_getpages+0x62/0x170
[552116.858596]  [] ? fallback_alloc+0x1be/0x270
[552116.858599]  [] ? cache_grow+0x2d1/0x320
[552116.858602]  [] ? ____cache_alloc_node+0x99/0x160
[552116.858605]  [] ? kmem_cache_alloc+0x11b/0x1b0
[552116.858610]  [] ? sk_prot_alloc+0x48/0x1d0
[552116.858615]  [] ? sk_clone+0x22/0x2c0
[552116.858619]  [] ? inet_csk_clone+0x16/0xd0
[552116.858624]  [] ? tcp_create_openreq_child+0x60/0x490
[552116.858627]  [] ? tcp_v4_syn_recv_sock+0x6a/0x310
[552116.858630]  [] ? tcp_check_req+0x249/0x4d0
[552116.858633]  [] ? tcp_v4_do_rcv+0x398/0x470
[552116.858636]  [] ? tcp_v4_rcv+0x52a/0x8d0
[552116.858644]  [] ? bond_start_xmit+0xbb/0x5d0 [bonding]
[552116.858648]  [] ? ip_local_deliver_finish+0xdd/0x2d0
[552116.858651]  [] ? ip_local_deliver+0x98/0xa0
[552116.858653]  [] ? ip_rcv_finish+0x12d/0x440
[552116.858656]  [] ? ip_rcv+0x285/0x370
[552116.858659]  [] ? __netif_receive_skb+0x4bb/0x780
[552116.858662]  [] ? tcp4_gro_receive+0x5a/0xd0
......

It’s hard to understand the “page allocation failure” error because the memory capacity of our servers is very big. By looking at the output of the “free” command, I noticed that a large amount of memory was used to cache files. Maybe the “free” memory is too small, so the kernel could not get enough pages when it needs many.
But how to reserve more “free” memory in the linux kernel? According to this article, we could modify “/proc/sys/vm/min_free_kbytes” to adjust the watermark of linux memory management, and the kernel will then try hard to reserve enough “free” memory.

After changing “/proc/sys/vm/min_free_kbytes” to 1G, the errors became rare but still existed. Then I changed it to 4G, and this time there weren’t any errors in dmesg.
In conclusion, the default value of “min_free_kbytes” in the kernel is too small; we’d better turn up “min_free_kbytes” on machines with big memory.

Too many “ext4-dio-unwrit” processes in system

After adding pressure to an application which writes a tremendous amount of data into an ext4 file system, we see many “ext4-dio-unwrit” kernel threads in the “top” screen. Many people say this is a normal phenomenon, so I checked the source code of ext4 in the 2.6.32 linux kernel.
Writing a file in the kernel begins with the write-back kernel thread, which calls generic_writepages() and then ext4_write_page():

ext4_write_page()
    --> ext4_set_bh_endio()
        --> ext4_end_io_buffer_write()
            --> ext4_add_complete_io()

Let’s look at ext4_add_complete_io():

/*
 * Add the io_end to per-inode completed end_io list.
 *
 * The io_end is appended to the inode's i_aio_dio_complete_list under
 * i_completed_io_lock.  The work item is queued on the superblock's
 * dio_unwritten_wq only when that list was empty beforehand.
 */
void ext4_add_complete_io(ext4_io_end_t *io_end)
{
    struct ext4_inode_info *ei = EXT4_I(io_end->inode);
    struct workqueue_struct *wq;
    unsigned long flags;
    /* Only io_ends flagged DIO_AIO_UNWRITTEN are expected here. */
    BUG_ON(!(io_end->flag & DIO_AIO_UNWRITTEN));
    /* The work queue belongs to the superblock, not to the inode. */
    wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
    spin_lock_irqsave(&ei->i_completed_io_lock, flags);
    /* Queue work only for the first entry of a previously empty list. */
    if (list_empty(&ei->i_aio_dio_complete_list)) {
        io_end->flag |= DIO_AIO_QUEUED;
        queue_work(wq, &io_end->work);
    }
    list_add_tail(&io_end->list, &ei->i_aio_dio_complete_list);
    spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}

It will put “io_end” into the work queue “dio_unwritten_wq”, which is in the ext4_sb_info. But where does “dio_unwritten_wq” come from? In fs/ext4/super.c:

static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
......
    EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
    if (!EXT4_SB(sb)->dio_unwritten_wq) {
        printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
        goto failed_mount_wq;
    }
......

Oh, it is the “ext4-dio-unwritten” kernel thread! So, the problem is solved: the application dirties pages of the file system cache, then the write-back kernel thread writes these dirty pages into the specific file system (ext4 in this article), and finally ext4 puts the io into the work queue “ext4-dio-unwritten” and waits to convert the unwritten extent into a written extent of ext4.
Therefore, if we don’t have unwritten extents in ext4 (just using the write() system call to append to a normal file), the “ext4-dio-unwritten” kernel threads will exist but will not use any CPU.

Upgrade to kernel-4.4.1 on CentOS 7

After I compiled and installed kernel-4.4.1 (from kernel.org) on my CentOS 7, I rebooted the machine. But it couldn’t boot up correctly.
Using

/usr/lib/dracut/skipcpio /boot/initramfs-4.4.1-XXX.img |zcat| cpio -id

to extract the content of the initramfs and check it, I found out that the ‘mpt2sas’ kernel driver had not been added into the initramfs, so the /boot partition could not be loaded.
This problem seems to be common. Because changing the dracut source code or configuration file on all servers is not viable, I chose to add a command to my kernel rpm spec file:

dracut --add-drivers "raid_class megaraid_sas dm-mod nvme mpt3sas scsi_transport_sas xfs" --kver %{version}-%{release}.%{_target_cpu} --force

This will add drivers to the corresponding initramfs file.
But the kernel still could not boot up. This time, I found that the command line in GRUB2 is like:

root=LABEL=XXX

It looks like we should change it to a UUID. Add another command to the kernel rpm spec file:

new-kernel-pkg --package %{name} --kernel-args="root=`grep -o -P "(?<=root=)\S+" /proc/cmdline`" --update %{version}-%{release}.%{_target_cpu}

This will get the UUID of the boot disk from /proc/cmdline and pass it to the GRUB2 configuration file.
Now, kernel-4.4.1 boots up correctly on CentOS 7.

“kmem_cache_create: duplicate cache XXX”

In my kernel module, firstly I wrote:

/*
 * First (buggy) version: the cache name is formatted into a buffer that is
 * local to this function, and that buffer is handed to kmem_cache_create().
 */
int alloc_device(const char *name, int number)
{
    /* NOTE(review): this local uses the same identifier as the 'name' parameter - confirm against the real module. */
    char name[64];
    snprintf(name, sizeof(name), "worker%d", number);
    /* The pointer passed here refers to the stack buffer above. */
    request_cache = kmem_cache_create(name, SECTOR_SIZE, 0, NULL, NULL);
    ......
}

In centos 7, this module works fine, but after porting it to centos 6, this kernel module reports:

kmem_cache_create: duplicate cache worker0
......

The key to this problem is in the implementation of kmem_cache_create() in 2.6.32 linux kernel (for centos 6):

struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
        unsigned long flags, void (*ctor)(void *))
{
        ......
        cachep->ctor = ctor;
        cachep->name = name;
......

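After creating a new cache, the kernel only stores the pointer to ‘name’; it does not make its own copy of the string. But the ‘name’ in my kernel module is a temporary variable on the stack, so the caches created earlier end up pointing at a stack buffer that is reused by later calls, and the kernel considers the name “duplicated”.
The correct code should look like: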

/*
 * One 64-byte name slot per device.  The storage has to stay valid for as
 * long as the caches exist, because the 2.6.32 kmem_cache_create() keeps the
 * pointer it is given instead of copying the string.
 *
 * Declared as a pointer to char[64] (not an array of 64 pointers) so that the
 * kcalloc() result can be assigned to it and names[number] is the 64-byte
 * slot belonging to device 'number'.
 */
static char (*names)[64];
/* before calling alloc_device() */
names = kcalloc(NR_OF_DEVICE, 64, GFP_KERNEL);
......
/*
 * Create the request cache for device 'number', naming it "worker<number>"
 * in that device's persistent name slot.
 */
int alloc_device(const char *name, int number)
{
    snprintf(names[number], sizeof(names[number]), "worker%d", number);
    /* 'flags' is an unsigned long, so pass 0 rather than NULL. */
    request_cache = kmem_cache_create(names[number], SECTOR_SIZE, 0, 0, NULL);
    ......
}

But why did the old kernel module not report an error in centos 7? Because in centos 7 the default memory allocator is SLUB, while in centos 6 it is SLAB. They have totally different implementations.

Run docker on centos6

Docker uses thin provisioning of device mapper as its default storage, therefore if we want to run docker on centos6, we should update the kernel first. I use linux kernel 4.11 and noticed that these kernel options should be set:

CONFIG_DM_THIN_PROVISIONING=m
CONFIG_NF_NAT=m
CONFIG_NF_NAT_MASQUERADE_IPV4=m
CONFIG_MEMCG=y
CONFIG_IP_NF_TARGET_MASQUERADE=m

After building the kernel and rebooting into it, I still couldn’t launch the docker service, and finally found the solution:

sudo route del -net 172.16.0.0 netmask 255.240.0.0

Solve a USB network card problem

I have been doing some source code porting work on the linux kernel recently.
After I rebooted my server (ubuntu 14.04) into the new kernel version 3.19.8, it could not be reached over ssh, only over the serial port.
The server is using a USB network card, so at first I suspected that some kernel driver for the USB NIC was missing from the .config file. Therefore I booted back into the old kernel and tried to find some useful information in ‘dmesg’:

asix 3-1.4:1.0 eth5: register 'asix' at usb-0000:00:11.0-1.4, ASIX AX88772B USB 2.0 Ethernet, 00:24:9c:04:44:28
......
eth3: link up, 100Mbps, full-duplex, lpa 0x45E1

eth3 is using an MII port, so when I grep for “link up …lpa” in the 3.19.8 kernel source code, I find that it must be printed by this code in drivers/net/mii.c:

unsigned int mii_check_media (struct mii_if_info *mii,
                  unsigned int ok_to_print,
                  unsigned int init_media)
{
......
    if (ok_to_print)
        netdev_info(mii->dev, "link up, %uMbps, %s-duplex, lpa 0x%04X\n",
                lpa2 & (LPA_1000FULL | LPA_1000HALF) ? 1000 :
                media & (ADVERTISE_100FULL | ADVERTISE_100HALF) ?
                100 : 10,
                duplex ? "full" : "half",
                lpa);
......

The only place where the “ASIX AX88772B” driver calls mii_check_media() is in drivers/net/usb/asix_devices.c:

static int ax88772_link_reset(struct usbnet *dev)
{
    u16 mode;
    struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
    mii_check_media(&dev->mii, 1, 1);
    mii_ethtool_gset(&dev->mii, &ecmd);
    mode = AX88772_MEDIUM_DEFAULT;
......
/*
 * Driver description for the "ASIX AX88772B USB 2.0 Ethernet" adapter.
 * Its .link_reset hook is ax88772_link_reset(), the function that calls
 * mii_check_media().
 */
static const struct driver_info ax88772b_info = {
    .description = "ASIX AX88772B USB 2.0 Ethernet",
    .bind = ax88772_bind,
    .unbind = ax88772_unbind,
    .status = asix_status,
    /* This is the path that ends up printing the "link up" message. */
    .link_reset = ax88772_link_reset,
    .reset = ax88772_reset,
    .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR |
             FLAG_MULTI_PACKET,
    .rx_fixup = asix_rx_fixup_common,
    .tx_fixup = asix_tx_fixup,
    .data = FLAG_EEPROM_MAC,
};

So far, the reason seems to be that the system does not “reset” the USB network card after booting up. But why does it only forget to reset the USB NIC in the 3.19.8 kernel? After checking /etc/network/interfaces:

auto eth0
iface eth0 inet dhcp
auto eth3
iface eth3 inet dhcp
auto usbnet0

the answer is: the device name of the USB NIC has been changed to “eth5” by udevd in the 3.19.8 kernel (the new kernel recognises new network ports, so eth0/eth1/eth2/eth3/eth4 have all been occupied by the system), so the network scripts can’t bring it up.
Solution:
Fix the name of USB NIC by adding below content into /etc/udev/rules.d/70-persistent-net.rules:

SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="00:24:9c:04:44:28", ATTR{dev_id}=="0x0", ATTR{type}=="1", KERNEL=="eth*", NAME="usbnet0"

and add configurations into /etc/network/interfaces to start “usbnet0” up automatically:

auto usbnet0
iface usbnet0 inet dhcp

The size of pipe in linux

We use a pipe in our program and face a new problem: it fails when we try to write 16MB of data into a pipe at one time. It looks like a pipe has a limited size. But what exactly is the size? After searching the web, I found that the answers are not consistent: some say it’s 16KB and others say it’s 64KB. Therefore I had to read the kernel code myself to find the correct answer.
Since all the servers in my company are using ali_kernel, which is based on the 2.6.32 centos kernel, I traced the original code path:

sys_pipe() --> sys_pipe2() --> do_pipe_flags() --> create_write_pipe():
struct file *create_write_pipe(int flags)
{
......
        path.dentry->d_flags &= ~DCACHE_UNHASHED;
        d_instantiate(path.dentry, inode);
        err = -ENFILE;
        f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
        if (!f)
                goto err_dentry;
        f->f_mapping = inode->i_mapping;
......

It looks like all the write operations on the pipe are managed by “write_pipefifo_fops”. Let’s get in:

/*
 * File operations installed on the file created by create_write_pipe()
 * (opened with FMODE_WRITE).  Reads are routed to bad_pipe_r; writes go
 * through do_sync_write / pipe_write.
 */
const struct file_operations write_pipefifo_fops = {
        .llseek         = no_llseek,
        .read           = bad_pipe_r,
        .write          = do_sync_write,
        /* pipe_write is the pipe-specific write routine */
        .aio_write      = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .open           = pipe_write_open,
        .release        = pipe_write_release,
        .fasync         = pipe_write_fasync,
};

Clearly, pipe_write() is responsible for writing. Keep going.

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
            unsigned long nr_segs, loff_t ppos)
{
......
        for (;;) {
                int bufs;
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }
                bufs = pipe->nrbufs;
                if (bufs < PIPE_BUFFERS) {
                        int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
                        struct pipe_buffer *buf = pipe->bufs + newbuf;
                        struct page *page = pipe->tmp_page;
                        char *src;
                        int error, atomic = 1;
                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }
......
                        pipe->nrbufs = ++bufs;
                        pipe->tmp_page = NULL;
                        total_len -= chars;
                        if (!total_len)
                                break;
                }
......
                pipe_wait(pipe);
......

As above, the kernel allocates a page when a new write comes in and the pipe does not have enough space. Every time it adds a page, it increases ‘pipe->nrbufs’, and once ‘nrbufs’ reaches PIPE_BUFFERS, the routine blocks in pipe_wait(), which means the write() system call will wait. ‘PIPE_BUFFERS’ is set to 16, and a page in the linux kernel is 4KB, so a pipe in ali_kernel can store 64KB (16 * 4KB) of data at one time.
This has changed since kernel version 2.6.35, which added a new proc entry, ‘/proc/sys/fs/pipe-max-size’.