系统调用,系统调用之“一调到底”

先写个简单的test.c
#include #include #include #include #include struct cdev test_cdev; dev_t devno; unsigned int major = 0; unsigned int minor = 0; int test_open (struct inode *nod, struct file *filp) { printk(" %s\n", __FUNCTION__); return 0; } struct file_operations test_ops = { .open = test_open, }; int init_test(void) { int err = 0; err = alloc_chrdev_region(&devno, 0, 1, "alloc register"); if(err){ printk(" cdev_add failed\n"); err = -EBUSY; goto fail; } major = MAJOR(devno); minor = MINOR(devno); printk("major is [%d], minor is [%d]\n", major, minor); cdev_init(&test_cdev, &test_ops); err = cdev_add(&test_cdev, devno, 1); if(err){ printk(" cdev_add failed\n"); err = -ENODEV; goto fail1; } printk("init \n"); return 0; fail: return err; fail1: unregister_chrdev_region(devno, 1); return err; } void exit_test(void) { cdev_del(&test_cdev); unregister_chrdev_region(devno, 1); printk("bye\n"); } module_init(init_test); module_exit(exit_test); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse"); MODULE_DESCRIPTION("this is a test module"); MODULE_VERSION("v0.1");

仅一个简单的open,应该不会再有更简单的字符设备驱动了吧。
app 层还应该有这么个东西。
fd = open("/dev/test", O_RDWR);
好了,上面下面都有了。那,中间是怎么个回事?
大致的过程是这么回事:
fd = open("/dev/test", O_RDWR);
sys_open
test_open
这个sys_open()可不是一个简单的函数,它包括了文件路径查找、文件权限判断等各种复杂的步骤。况且,不知从何时起,内核里的sys_open已不再以普通函数的形式定义:用ctags跳转不到它,即便用find/grep搜到,看到的也是SYSCALL_DEFINE3这样的宏形式,早已面目全非。
-- fs/open.c --
/*
 * Entry point of the open system call, defined through SYSCALL_DEFINE3.
 *
 * ORs O_LARGEFILE into flags when force_o_largefile() is true, then hands
 * the request to do_sys_open() with AT_FDCWD as the directory fd and
 * returns whatever do_sys_open() returned.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long ret;

	if (force_o_largefile())
		flags |= O_LARGEFILE;

	ret = do_sys_open(AT_FDCWD, filename, flags, mode); // ==> continued in section "bb"
	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);
	return ret;
}
有人问了,这个SYSCALL_DEFINE3是个什么东西,“你最好不要追究这样的问题”,就好像linus说,“你知道我现在想到的一个数字是多少吗?” “我怎么知道,你随便一想,就让我一通乱猜吗,坑爹啊”
内核里的各种宏定义,您若看透了,请通知我,我对您的敬佩必如滔滔江水。好了,还是简单的gcc -E一下简单的瞧瞧。
/*
 * Stand-alone copy of the syscall-definition macros, meant to be fed to
 * `gcc -E` so that the expansion of SYSCALL_DEFINE3(open, ...) can be
 * inspected. The __SC_* helpers, SYSCALL_METADATA and SYSCALL_ALIAS are
 * deliberately not defined here, so they stay unexpanded in the output.
 */

/*
 * Emits, for a syscall called <name>:
 *   - a declaration of sys<name>,
 *   - a forward declaration of the static inline SYSC<name>,
 *   - the wrapper SyS<name>, which runs __SC_TEST and calls SYSC<name>
 *     with __SC_CAST-converted arguments,
 *   - an alias from sys<name> to SyS<name>,
 *   - and finally the header of SYSC<name>, so that the function body
 *     written after the macro invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
	{								\
		__SC_TEST##x(__VA_ARGS__);				\
		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))

/*
 * Adds two string tables (types_<sname>, args_<sname>) and a
 * SYSCALL_METADATA record in front of the definitions produced by
 * __SYSCALL_DEFINEx.
 */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	static const char *types_##sname[] = {			\
		__SC_STR_TDECL##x(__VA_ARGS__)			\
	};							\
	static const char *args_##sname[] = {			\
		__SC_STR_ADECL##x(__VA_ARGS__)			\
	};							\
	SYSCALL_METADATA(sname, x);				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/* Three-argument form; note the '_' pasted in front of the name. */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

/* Dummy main whose only purpose is to give the preprocessor something to expand. */
int main(void)
{
	SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode);
}
真面目如下:
/*
 * Preprocessor output for the harness: the single SYSCALL_DEFINE3(open, ...)
 * line has turned into the two string tables, the metadata record and the
 * sys_open / SyS_open / SYSC_open declarations. The __SC_* helpers,
 * SYSCALL_METADATA and SYSCALL_ALIAS remain as-is because the harness does
 * not define them. This is shown for reading, not for compiling.
 */
int main(void)
{
	/* From SYSCALL_DEFINEx: string tables and metadata. */
	static const char *types__open[] = {
		__SC_STR_TDECL3(const char __user *, filename, int, flags, int, mode)
	};
	static const char *args__open[] = {
		__SC_STR_ADECL3(const char __user *, filename, int, flags, int, mode)
	};
	SYSCALL_METADATA(_open, 3);

	/* From __SYSCALL_DEFINEx: the sys_open declaration and the SYSC_open forward declaration. */
	asmlinkage long sys_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
	static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));

	/* Wrapper that forwards to SYSC_open after the __SC_TEST3 check. */
	asmlinkage long SyS_open(__SC_LONG3(const char __user *, filename, int, flags, int, mode))
	{
		__SC_TEST3(const char __user *, filename, int, flags, int, mode);
		return (long) SYSC_open(__SC_CAST3(const char __user *, filename, int, flags, int, mode));
	}
	SYSCALL_ALIAS(sys_open, SyS_open);

	/* Header of SYSC_open; in fs/open.c the syscall body follows here instead of ';'. */
	static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
}
一些宏还未展开,点到为止,见好就收吧。
我们继续往下看。
bb:
/*
 * Core of open(): copies the user-supplied path into the kernel, reserves a
 * file descriptor, opens the file and binds the two together.
 *
 * Returns the new fd on success. On failure returns the error carried by
 * getname() (via PTR_ERR), the negative value from get_unused_fd_flags(),
 * or the error carried by do_filp_open() (via PTR_ERR).
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	char *tmp = getname(filename);	// filename is copied into kernel space (tmp) ==> cc
	int fd = PTR_ERR(tmp);		// article note: PTR_ERR is just "return (long) ptr;"

	if (!IS_ERR(tmp)) {
		fd = get_unused_fd_flags(flags);	// reserve an unused fd ==> dd
		if (fd >= 0) {
			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);	// ==> ee
			if (IS_ERR(f)) {
				/* Open failed: give the reserved fd back and report the error. */
				put_unused_fd(fd);
				fd = PTR_ERR(f);
			} else {
				fsnotify_open(f->f_path.dentry);	// ==> ff
				fd_install(fd, f);	// bind fd to the struct file so read/write etc. can use it ==> gg
			}
		}
		/*
		 * Done with the path: release the kernel buffer that temporarily
		 * held filename (article note: ends in kmem_cache_free).
		 */
		putname(tmp);
	}
	return fd;
}

cc:
/* Allocate a name buffer from the names_cachep slab cache. */
#define __getname_gfp(gfp)	kmem_cache_alloc(names_cachep, (gfp))
#define __getname()		__getname_gfp(GFP_KERNEL)

/*
 * Copy a user-space path name into a freshly allocated kernel buffer.
 *
 * Returns the kernel buffer on success, ERR_PTR(-ENOMEM) if the buffer
 * could not be allocated, or ERR_PTR(retval) if do_getname() reported a
 * negative retval (the buffer is released with __putname() in that case).
 * audit_getname() is called with the result on every path.
 */
char * getname(const char __user * filename)
{
	char *tmp, *result;

	result = ERR_PTR(-ENOMEM);
	tmp = __getname();	// kmem_cache_alloc: carve out a buffer
	if (tmp) {
		// copy the file name into kernel data space (*tmp) before using it
		int retval = do_getname(filename, tmp);

		result = tmp;
		if (retval < 0) {
			__putname(tmp);
			result = ERR_PTR(retval);
		}
	}
	audit_getname(result);
	return result;
}
dd:
#define get_unused_fd_flags(flags) alloc_fd(0, (flags)) -- fs/file.c -- /* * allocate a file descriptor, mark it busy. */ int alloc_fd(unsigned start, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, fd); //这个很熟悉的函数==>ddD error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; FD_SET(fd, fdt->open_fds); if (flags & O_CLOEXEC) FD_SET(fd, fdt->close_on_exec); else FD_CLR(fd, fdt->close_on_exec); error = fd; #if 1 /* Sanity check */ if (rcu_dereference(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; }
ddD:
一个出镜率很高的函数,常用于各种描述符(编号)的分配。当然了,这些编号都是按顺序分配的,用位图(类似数组)的形式记录,位图里的0表示未分配,然后遍历去找这些0就是了。
/*
 * Find the first zero bit at position >= offset in a bitmap of 'size' bits.
 *
 * addr:   start of the bitmap, an array of unsigned long words.
 * size:   number of valid bits in the bitmap.
 * offset: bit position at which to start searching.
 *
 * Returns 'size' immediately when offset >= size. Otherwise returns the
 * position computed at found_first / found_middle; when the scan runs out
 * of bits without finding a zero, that value is 'size' as well.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
				 unsigned long offset)
{
	/* Article note: "p = addr". NOTE(review): BITOP_WORD is not shown here; presumably true only when offset lies in the first word. */
	const unsigned long *p = addr + BITOP_WORD(offset);
	/* offset rounded down to a word boundary; 0 when offset < BITS_PER_LONG. */
	unsigned long result = offset & ~(BITS_PER_LONG-1);
	unsigned long tmp;

	if (offset >= size)
		return size;
	size -= result;
	offset %= BITS_PER_LONG;
	if (offset) {
		/* Partial first word: force the bits below 'offset' to 1 so they are never reported. */
		tmp = *(p++);
		tmp |= ~0UL >> (BITS_PER_LONG - offset);
		if (size < BITS_PER_LONG)
			goto found_first;
		if (~tmp)
			goto found_middle;
		size -= BITS_PER_LONG;
		result += BITS_PER_LONG;
	}
	/* Whole words: stop at the first one that is not all ones. */
	while (size & ~(BITS_PER_LONG-1)) {
		if (~(tmp = *(p++)))
			goto found_middle;
		result += BITS_PER_LONG;
		size -= BITS_PER_LONG;
	}
	if (!size)
		return result;
	/* Trailing partial word. */
	tmp = *p;

found_first:
	/* Force the bits at and above 'size' to 1 so bits past the end are ignored. */
	tmp |= ~0UL << size;
	if (tmp == ~0UL)	/* Are any bits zero? */
		return result + size;	/* Nope. */
found_middle:
	return result + ffz(tmp);
}
下面是我们理解的重点,也是一调到底的精髓。重点在于struct file的分配。
ee:
/*
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 *
 * Excerpt only: the O_CREAT path and the error-exit labels are elided by
 * the article ("... ..."). What is shown is the plain-lookup path: derive
 * the access mode, look the path up, run may_open(), then turn the
 * nameidata into a struct file via nameidata_to_filp().
 * Returns the struct file, or ERR_PTR(error) when the lookup fails.
 */
struct file *do_filp_open(int dfd, const char *pathname,
		int open_flag, int mode, int acc_mode)
{
	struct file *filp;
	struct nameidata nd;
	int error;
	struct path path;
	struct dentry *dir;
	int count = 0;
	int will_write;
	int flag = open_to_namei_flags(open_flag);

	/* Derive the access mode for this open when the caller passed none. */
	if (!acc_mode)
		acc_mode = MAY_OPEN | ACC_MODE(flag);

	/* O_TRUNC implies we need access checks for write permissions */
	if (flag & O_TRUNC)
		acc_mode |= MAY_WRITE;

	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
	if (flag & O_APPEND)
		acc_mode |= MAY_APPEND;

	/*
	 * The simplest case - just a plain lookup.
	 */
	if (!(flag & O_CREAT)) {
		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
					 &nd, flag);
		if (error)
			return ERR_PTR(error);
		goto ok;
	}
... ...
ok:
	/*
	 * Consider:
	 * 1. may_open() truncates a file
	 * 2. a rw->ro mount transition occurs
	 * 3. nameidata_to_filp() fails due to
	 *    the ro mount.
	 * That would be inconsistent, and should
	 * be avoided. Taking this mnt write here
	 * ensures that (2) can not occur.
	 */
	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
	if (will_write) {
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit;
	}
	error = may_open(&nd.path, acc_mode, flag);	// **
	if (error) {
		if (will_write)
			mnt_drop_write(nd.path.mnt);
		goto exit;
	}
	filp = nameidata_to_filp(&nd, open_flag);	// obtain the struct file: filp
	if (IS_ERR(filp))
		ima_counts_put(&nd.path,
			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
	/*
	 * It is now safe to drop the mnt write
	 * because the filp has had a write taken
	 * on its own behalf.
	 */
	if (will_write)
		mnt_drop_write(nd.path.mnt);
	if (nd.root.mnt)
		path_put(&nd.root);
	return filp;
... ...
}

/*
 * Turn a completed lookup (nd) into an opened struct file.
 *
 * Takes the struct file stored in the open intent. If its f_path.dentry is
 * still NULL, the file is opened here through __dentry_open() using the
 * dentry and mount from nd->path; otherwise only the reference held by
 * nd->path is dropped. Returns the resulting struct file pointer (whatever
 * __dentry_open() returned in the first case).
 */
struct file *nameidata_to_filp(struct nameidata *nd, int flags)
{
	const struct cred *cred = current_cred();
	struct file *filp;

	/* Pick up the filp from the open intent */
	filp = nd->intent.open.file;
	/* Has the filesystem initialised the file for us? */
	if (filp->f_path.dentry == NULL)
		filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
				     NULL, cred);	// !!! key step: passes open == NULL, so __dentry_open picks the open method itself
	else
		path_put(&nd->path);
	return filp;
}

static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; f->f_flags = flags; f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); if (error) goto cleanup_file; if (!special_file(inode->i_mode)) file_take_write(f); } f->f_mapping = inode->i_mapping; f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); //!!! !!! file_move(f, &inode->i_sb->s_files); error = security_dentry_open(f, cred); if (error) goto cleanup_all; if (!open && f->f_op) //f->f_op若有,则执行open open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
说下六个感叹号的地方。记得我们在注册字符设备的时候是否有个cdev_init ? 她的体内是不是有个 cdev->ops = fops ?
inode里是不是有个i_cdev ?
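这里,file的f_op是不是被赋了inode的i_fop ?(补充一步:对字符设备节点而言,inode的i_fop是def_chr_fops,它的open就是chrdev_open;chrdev_open根据设备号找到我们注册的cdev,记到inode的i_cdev里,再把file的f_op换成cdev->ops,并调用其中的open,也就是test_open。)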
打开struct file, struct inode的定义处,多瞧上两眼。这里就不贴了。
就这样,fd = open("/dev/test", O_RDWR) 最终还是调到了test_open 。
最后就是个收尾函数,将得到的fd和struct file关联起来。
gg:
/*
 * Bind an already reserved file descriptor to a struct file.
 *
 * Stores 'file' into slot 'fd' of the current task's fd table while
 * holding files->file_lock. The slot must still be empty: a non-NULL
 * entry triggers BUG_ON.
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	BUG_ON(fdt->fd[fd] != NULL);
	/* Store the pointer with rcu_assign_pointer(); alloc_fd() reads this slot with rcu_dereference(). */
	rcu_assign_pointer(fdt->fd[fd], file);
	spin_unlock(&files->file_lock);
}

do_sys_open 的结尾 return fd; 返回给 app。
fd = open("/dev/test", O_RDWR)
你懂的。
Tags:  系统调用实现过程 系统调用的目的 系统调用失败 什么是系统调用 系统调用

延伸阅读

最新评论

发表评论