Creation and Mapping of Shared Memory in the Linux Kernel

Creating Shared Memory

Shared memory is created via shmget(). This function takes the current process's ipc_ns as its ipc_namespace, sets up the shared-memory operations shm_ops, packs the arguments key, size and shmflg into shm_params, and finally calls ipcget().

SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
    struct ipc_namespace *ns;
    static const struct ipc_ops shm_ops = {
        .getnew = newseg,
        .associate = shm_security,
        .more_checks = shm_more_checks,
    };
    struct ipc_params shm_params;
    ns = current->nsproxy->ipc_ns;
    shm_params.key = key;
    shm_params.flg = shmflg;
    shm_params.u.size = size;
    return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}
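
Seen from user space, this path is entered by an ordinary shmget() call. Below is a minimal user-space sketch, not part of the kernel source above; the key 0x1234, the size, and the permission bits are arbitrary example values:

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
    /* IPC_CREAT: create the segment if the key is unused -- this takes the
     * ipcget_public() branch shown below; a key of IPC_PRIVATE would take
     * the ipcget_new() branch instead. */
    int shmid = shmget(0x1234, 4096, IPC_CREAT | 0666);
    if (shmid < 0) {
        perror("shmget");
        return 1;
    }
    printf("shmid = %d\n", shmid);
    return 0;
}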

ipcget() checks whether the key is IPC_PRIVATE: if so, it calls ipcget_new() to create a new segment; otherwise it calls ipcget_public() to open the corresponding shared memory.

int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
            const struct ipc_ops *ops, struct ipc_params *params)
{
    if (params->key == IPC_PRIVATE)
        return ipcget_new(ns, ids, ops, params);
    else
        return ipcget_public(ns, ids, ops, params);
}

ipcget_new() creates a new IPC object through the registered ops->getnew(), i.e. the newseg() named above. ipcget_public() looks up the struct kern_ipc_perm by key. If it is not found, it checks whether IPC_CREAT is set: if so, it calls ops->getnew() to create a new object; otherwise it returns -ENOENT. If it is found, the corresponding id is returned.

static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
        const struct ipc_ops *ops, struct ipc_params *params)
{
    int err;
    down_write(&ids->rwsem);
    err = ops->getnew(ns, params);
    up_write(&ids->rwsem);
    return err;
}

static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
        const struct ipc_ops *ops, struct ipc_params *params)
{
    struct kern_ipc_perm *ipcp;
    int flg = params->flg;
    int err;
    /*
     * Take the lock as a writer since we are potentially going to add
     * a new entry + read locks are not "upgradable"
     */
    down_write(&ids->rwsem);
    ipcp = ipc_findkey(ids, params->key);
    if (ipcp == NULL) {
        /* key not used */
        if (!(flg & IPC_CREAT))
            err = -ENOENT;
        else
            err = ops->getnew(ns, params);
    } else {
......
            if (!err)
                /*
                 * ipc_check_perms returns the IPC id on
                 * success
                 */
                err = ipc_check_perms(ns, ipcp, ops, params);
        }
        ipc_unlock(ipcp);
    }
    up_write(&ids->rwsem);
    return err;
}

So every new creation eventually reaches the registered newseg() function. Its main logic is:

  • Allocate a struct shmid_kernel with kvmalloc() in the directly mapped region; this structure describes the shared memory segment.
  • Call hugetlb_file_setup() or shmem_kernel_file_setup() to associate a file with the segment. A virtual address range can be mapped to physical memory directly, but page-table setup avoids physical pages that already belong to an existing mapping, i.e. such physical memory stays private to a single process. How, then, can the same physical memory be mapped into the virtual address spaces of several processes? Through a file: a virtual address range can also be mapped to a file, and a file can be shared across processes. The file here is not one stored on disk, but a file in an in-memory file system. Be careful to distinguish shmem from shm: the former is a file system, the latter an inter-process communication mechanism.
  • Call ipc_addid() to insert the newly created struct shmid_kernel into the radix tree in shm_ids, return the corresponding id, and link the struct shmid_kernel into the current process's sysvshm list.
/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
 * Called with shm_ids.rwsem held as a writer.
 */
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
    key_t key = params->key;
    int shmflg = params->flg;
    size_t size = params->u.size;
    int error;
    struct shmid_kernel *shp;
    size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    struct file *file;
    char name[13];
......
    shp = kvmalloc(sizeof(*shp), GFP_KERNEL);
......
    shp->shm_perm.key = key;
    shp->shm_perm.mode = (shmflg & S_IRWXUGO);
    shp->mlock_user = NULL;
    shp->shm_perm.security = NULL;
......
    if (shmflg & SHM_HUGETLB) {
......
        file = hugetlb_file_setup(name, hugesize, acctflag,
                  &shp->mlock_user, HUGETLB_SHMFS_INODE,
                (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
    } else {
......
        file = shmem_kernel_file_setup(name, size, acctflag);
    }
......
    shp->shm_cprid = get_pid(task_tgid(current));
    shp->shm_lprid = NULL;
    shp->shm_atim = shp->shm_dtim = 0;
    shp->shm_ctim = ktime_get_real_seconds();
    shp->shm_segsz = size;
    shp->shm_nattch = 0;
    shp->shm_file = file;
    shp->shm_creator = current;
    /* ipc_addid() locks shp upon success. */
    error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
......
    list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
    /*
     * shmid gets reported as "inode#" in /proc/pid/maps.
     * proc-ps tools use this. Changing this will break them.
     */
    file_inode(file)->i_ino = shp->shm_perm.id;
    ns->shm_tot += numpages;
    error = shp->shm_perm.id;
......
}

shmem_kernel_file_setup() actually creates a file in the shmem file system: __shmem_file_setup() creates the dentry and inode for the new shmem file, associates the two, then allocates a struct file to represent the new shmem file and points it at the dedicated shmem_file_operations.

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * kernel internal.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
    return __shmem_file_setup(name, size, flags, S_PRIVATE);
}

static struct file *__shmem_file_setup(const char *name, loff_t size,
               unsigned long flags, unsigned int i_flags)
{
    struct file *res;
    struct inode *inode;
    struct path path;
    struct super_block *sb;
    struct qstr this;
......
    this.name = name;
    this.len = strlen(name);
    this.hash = 0; /* will go */
    sb = shm_mnt->mnt_sb;
    path.mnt = mntget(shm_mnt);
    path.dentry = d_alloc_pseudo(sb, &this);
    d_set_d_op(path.dentry, &anon_ops);
......
    inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
    inode->i_flags |= i_flags;
    d_instantiate(path.dentry, inode);
    inode->i_size = size;
......
    res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
        &shmem_file_operations);
    return res;
}

Mapping Shared Memory

From the code walkthrough above we know that the shared memory data structure struct shmid_kernel manages the in-memory file on the shmem file system through its member struct file *shm_file. Whether or not the shared memory is currently mapped, shm_file always exists.

From the user's point of view, shared memory is mapped by calling shmat(). Its main logic is:

  • Call shm_obtain_object_check() to find the struct shmid_kernel in the radix tree by the shared memory id, and through it locate the in-memory file base on shmem.
  • Allocate a struct shm_file_data, sfd, to represent this in-memory file base.
  • Create a backing file, file, pointing at the in-memory file base, and store sfd in its private_data. The comment in the source explains why a new file is created instead of using base directly: in short, base is the shm_file inside the shmem file system, used to manage the in-memory file; it is a neutral file, independent of any particular process. The newly created struct file is dedicated to the memory mapping.
  • Call do_mmap_pgoff() to allocate a vm_area_struct covering an unallocated region of the virtual address space, with its vm_file pointing at file, then call the mmap() function in shm_file_operations, i.e. shm_mmap(), to complete the mapping.
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
    unsigned long ret;
    long err;
    err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
    if (err)
        return err;
    force_successful_syscall_return();
    return (long)ret;
}

long do_shmat(int shmid, char __user *shmaddr, int shmflg,
          ulong *raddr, unsigned long shmlba)
{
    struct shmid_kernel *shp;
    unsigned long addr = (unsigned long)shmaddr;
    unsigned long size;
    struct file *file, *base;
    int    err;
    unsigned long flags = MAP_SHARED;
    unsigned long prot;
    int acc_mode;
    struct ipc_namespace *ns;
    struct shm_file_data *sfd;
    int f_flags;
    unsigned long populate = 0;
......
    if (shmflg & SHM_RDONLY) {
        prot = PROT_READ;
        acc_mode = S_IRUGO;
        f_flags = O_RDONLY;
    } else {
        prot = PROT_READ | PROT_WRITE;
        acc_mode = S_IRUGO | S_IWUGO;
        f_flags = O_RDWR;
    }
    if (shmflg & SHM_EXEC) {
        prot |= PROT_EXEC;
        acc_mode |= S_IXUGO;
    }
    /*
     * We cannot rely on the fs check since SYSV IPC does have an
     * additional creator id...
     */
    ns = current->nsproxy->ipc_ns;
    shp = shm_obtain_object_check(ns, shmid);
......
    /*
     * We need to take a reference to the real shm file to prevent the
     * pointer from becoming stale in cases where the lifetime of the outer
     * file extends beyond that of the shm segment.  It's not usually
     * possible, but it can happen during remap_file_pages() emulation as
     * that unmaps the memory, then does ->mmap() via file reference only.
     * We'll deny the ->mmap() if the shm segment was since removed, but to
     * detect shm ID reuse we need to compare the file pointers.
     */
    base = get_file(shp->shm_file);
    shp->shm_nattch++;
    size = i_size_read(file_inode(base));
    ipc_unlock_object(&shp->shm_perm);
    rcu_read_unlock();
    err = -ENOMEM;
    sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
......
    file = alloc_file_clone(base, f_flags,
              is_file_hugepages(base) ?
                &shm_file_operations_huge :
                &shm_file_operations);
......
    sfd->id = shp->shm_perm.id;
    sfd->ns = get_ipc_ns(ns);
    sfd->file = base;
    sfd->vm_ops = NULL;
    file->private_data = sfd;
......
    addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
    *raddr = addr;
    err = 0;
......
}
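
Before following the mapping into shm_mmap(), here is a minimal user-space sketch of the attach side (again not kernel code; the key 0x1234 matches the earlier shmget() example):

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
    int shmid = shmget(0x1234, 4096, IPC_CREAT | 0666);
    if (shmid < 0) {
        perror("shmget");
        return 1;
    }
    /* Passing NULL as the address lets do_shmat() pick one via do_mmap_pgoff(). */
    char *p = shmat(shmid, NULL, 0);
    if (p == (void *)-1) {
        perror("shmat");
        return 1;
    }
    strcpy(p, "hello");   /* the first touch faults a page in (see below) */
    shmdt(p);             /* detach; the segment itself remains */
    return 0;
}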
shm_mmap() calls the mmap() function of the file stored in shm_file_data; this time the call goes to the mmap of shmem_file_operations, i.e. shmem_mmap().

static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
    struct shm_file_data *sfd = shm_file_data(file);
    int ret;
    /*
     * In case of remap_file_pages() emulation, the file can represent an
     * IPC ID that was removed, and possibly even reused by another shm
     * segment already.  Propagate this case as an error to caller.
     */
    ret = __shm_open(vma);
    if (ret)
        return ret;
    ret = call_mmap(sfd->file, vma);
    if (ret) {
        shm_close(vma);
        return ret;
    }
    sfd->vm_ops = vma->vm_ops;
    vma->vm_ops = &shm_vm_ops;
    return 0;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{ 
    file_accessed(file); 
    vma->vm_ops = &shmem_vm_ops; 
    return 0;
}

Inside shmem_mmap(), the vm_ops of the vm_area_struct is set to shmem_vm_ops. After call_mmap() returns, shm_mmap() saves that pointer into the vm_ops of shm_file_data, and then redirects the vm_ops of the vm_area_struct to shm_vm_ops.

static const struct vm_operations_struct shm_vm_ops = {
    .open  = shm_open,  /* callback for a new vm-area open */
    .close  = shm_close,  /* callback for when the vm-area is released */
    .fault  = shm_fault,
};

static const struct vm_operations_struct shmem_vm_ops = {
    .fault    = shmem_fault,
    .map_pages  = filemap_map_pages,
};

As discussed in the earlier article on memory mapping, physical memory is not allocated when the mapping is established, but lazily, through a page fault on the first actual access. The same holds for shared memory. When an access faults, the fault callback of the vm_area_struct's vm_ops, i.e. shm_fault() of shm_vm_ops, is invoked first. It then delegates to the vm_ops stored in shm_file_data, i.e. the fault callback of shmem_vm_ops, shmem_fault().

shmem_fault() calls shmem_getpage_gfp() to find a page in the page cache or bring one in from swap; if neither succeeds, it allocates a new page through shmem_alloc_and_acct_page(), which ultimately calls alloc_page_vma() in the memory management subsystem to allocate a physical page.

static int shm_fault(struct vm_fault *vmf)
{
    struct file *file = vmf->vma->vm_file;
    struct shm_file_data *sfd = shm_file_data(file);
    return sfd->vm_ops->fault(vmf);
}

static int shmem_fault(struct vm_fault *vmf)
{
    struct vm_area_struct *vma = vmf->vma;
    struct inode *inode = file_inode(vma->vm_file);
    gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
......
    error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
          gfp, vma, vmf, &ret);
......
}

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * fault_mm and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
    struct page **pagep, enum sgp_type sgp, gfp_t gfp,
    struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
{
......
    page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
          index, false);
......
}

At this point the shared memory is truly mapped into the virtual address space, and the process can access it just like local memory.
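
As a closing illustration, here is a user-space sketch (not kernel code) that exercises the whole path end to end: the parent creates and attaches a segment, the child writes into it, and the parent reads the data back through the same shmem pages faulted in by shmem_fault():

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
    if (shmid < 0) {
        perror("shmget");
        return 1;
    }
    char *p = shmat(shmid, NULL, 0);        /* do_shmat() -> shm_mmap() */
    if (p == (void *)-1) {
        perror("shmat");
        return 1;
    }
    if (fork() == 0) {                      /* the child inherits the mapping */
        strcpy(p, "hello from child");      /* page fault -> shmem_fault() */
        shmdt(p);
        _exit(0);
    }
    wait(NULL);
    printf("%s\n", p);                      /* the parent sees the child's write */
    shmdt(p);
    shmctl(shmid, IPC_RMID, NULL);          /* destroy the segment */
    return 0;
}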
