From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 25 Mar 2005 12:19:43 +0000 (+0000)
Subject: merge from 2_2_merge1 to 2_2_merge2
X-Git-Tag: fuse_2_3_pre1~2
X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=407e6a7297a7e126182934c53946eac44fe6b54b;p=qemu-gpiodev%2Flibfuse.git

merge from 2_2_merge1 to 2_2_merge2
---

diff --git a/ChangeLog b/ChangeLog
index 9ffdbda..ba85a21 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2005-03-24  Miklos Szeredi <miklos@szeredi.hu>
+
+	* kernel: trivial cleanups
+
 2005-03-21  Miklos Szeredi <miklos@szeredi.hu>
 
 	* Add fsyncdir() operation
diff --git a/Makefile.am b/Makefile.am
index 28f5b32..caeaed7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,7 +7,8 @@ EXTRA_DIST = \
 	README* \
 	Filesystems \
 	FAQ \
-	doc/how-fuse-works
+	doc/how-fuse-works \
+	doc/kernel.txt
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = fuse.pc
diff --git a/doc/kernel.txt b/doc/kernel.txt
new file mode 100644
index 0000000..6fcfada
--- /dev/null
+++ b/doc/kernel.txt
@@ -0,0 +1,130 @@
+The following diagram shows how a filesystem operation (in this
+example unlink) is performed in FUSE.
+
+NOTE: everything in this description is greatly simplified
+
+ |  "rm /mnt/fuse/file"               |  FUSE filesystem daemon
+ |                                    |
+ |                                    |  >sys_read()
+ |                                    |    >fuse_dev_read()
+ |                                    |      >request_wait()
+ |                                    |        [sleep on fc->waitq]
+ |                                    |
+ |  >sys_unlink()                     |
+ |    >fuse_unlink()                  |
+ |      [get request from             |
+ |       fc->unused_list]             |
+ |      >request_send()               |
+ |        [queue req on fc->pending]  |
+ |        [wake up fc->waitq]         |        [woken up]
+ |        >request_wait_answer()      |
+ |          [sleep on req->waitq]     |
+ |                                    |      <request_wait()
+ |                                    |      [get request from
+ |                                    |       fc->pending]
+ |                                    |      [copy req to read buffer]
+ |                                    |      [add req to fc->processing]
+ |                                    |    <fuse_dev_read()
+ |                                    |  <sys_read()
+ |                                    |
+ |                                    |  [perform unlink]
+ |                                    |
+ |                                    |  >sys_write()
+ |                                    |    >fuse_dev_write()
+ |                                    |      [look up req in fc->processing]
+ |                                    |      [remove from fc->processing]
+ |                                    |      [copy write buffer to req]
+ |          [woken up]                |      [wake up req->waitq]
+ |                                    |    <fuse_dev_write()
+ |                                    |  <sys_write()
+ |        <request_wait_answer()      |
+ |      <request_send()               |
+ |      [add request to               |
+ |       fc->unused_list]             |
+ |    <fuse_unlink()                  |
+ |  <sys_unlink()                     |
+
+Scenario 1 - Simple deadlock
+----------------------------
+
+ |  "rm /mnt/fuse/file"               |  FUSE filesystem daemon
+ |                                    |
+ |  >sys_unlink("/mnt/fuse/file")     |
+ |    [acquire inode semaphore        |
+ |     for "file"]                    |
+ |    >fuse_unlink()                  |
+ |      [sleep on req->waitq]         |
+ |                                    |  <sys_read()
+ |                                    |  >sys_unlink("/mnt/fuse/file")
+ |                                    |    [acquire inode semaphore
+ |                                    |     for "file"]
+ |                                    |    *DEADLOCK*
+
+The solution for this is to allow requests to be interrupted while
+they are in userspace:
+
+ |      [interrupted by signal]       |
+ |    <fuse_unlink()                  |
+ |  <sys_unlink()                     |  >fuse_unlink()
+ |                                    |    [queue req on fc->pending]
+ |                                    |    [wake up fc->waitq]
+ |                                    |    [sleep on req->waitq]
+
+If the filesystem daemon is single threaded, things will stop here,
+since there is no other thread to dequeue and execute the request.
+In this case the solution is to kill the FUSE daemon as well.  If
+there are multiple serving threads, you just have to kill them as
+long as any remain.
+
+Moral: a filesystem which deadlocks can soon find itself dead.
+
+Scenario 2 - Tricky deadlock
+----------------------------
+
+This one needs a carefully crafted filesystem.  It's a variation on
+the above, only the call back into the filesystem is not explicit,
+but is caused by a pagefault.
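+
+As a rough illustration (hypothetical userspace code; error handling
+and the reply header setup are omitted), thread 1 of such a
+filesystem could set up the trap like this, before the sequence
+traced in the diagram below:
+
+	int fd = open("/mnt/fuse/file", O_RDONLY);
+	char *addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
+	close(fd);		/* FLUSH triggers the 'magic' flag */
+	char c = *addr;		/* page fault sends a READ request */
+
+Thread 2 then answers that READ with a write() whose buffer overlaps
+'addr', so copying the reply page-faults on the very page that is
+being read in.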
+
+ |  Kamikaze filesystem thread 1      |  Kamikaze filesystem thread 2
+ |                                    |
+ |  [fd = open("/mnt/fuse/file")]     |  [request served normally]
+ |  [mmap fd to 'addr']               |
+ |  [close fd]                        |  [FLUSH triggers 'magic' flag]
+ |  [read a byte from addr]           |
+ |  >do_page_fault()                  |
+ |    [find or create page]           |
+ |    [lock page]                     |
+ |    >fuse_readpage()                |
+ |      [queue READ request]          |
+ |      [sleep on req->waitq]         |
+ |                                    |  [read request to buffer]
+ |                                    |  [create reply header before addr]
+ |                                    |  >sys_write(addr - headerlength)
+ |                                    |    >fuse_dev_write()
+ |                                    |      [look up req in fc->processing]
+ |                                    |      [remove from fc->processing]
+ |                                    |      [copy write buffer to req]
+ |                                    |        >do_page_fault()
+ |                                    |           [find or create page]
+ |                                    |           [lock page]
+ |                                    |           * DEADLOCK *
+
+The solution is again to let the request be interrupted (not
+elaborated further).
+
+An additional problem is that while the write buffer is being copied
+to the request, the request must not be interrupted.  This is because
+the destination address of the copy may not be valid after the
+request is interrupted.
+
+This is solved by doing the copy atomically, and by allowing
+interruption while the page(s) belonging to the write buffer are
+faulted in with get_user_pages().  The 'req->locked' flag indicates
+when the copy is taking place, and interruption is delayed until
+this flag is unset.
diff --git a/kernel/dev.c b/kernel/dev.c
index 5941035..b17296a 100644
--- a/kernel/dev.c
+++ b/kernel/dev.c
@@ -146,6 +146,11 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc)
 	return do_get_request(fc);
 }
 
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
+ * return NULL, but only if the signal is SIGKILL.
+ */
 struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc)
 {
 	int intr;
@@ -165,6 +170,7 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
 	else
 		fuse_request_free(req);
 
+	/* If we are in debt, decrease that first */
 	if (fc->outstanding_debt)
 		fc->outstanding_debt--;
 	else
@@ -180,10 +186,8 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 
 void fuse_release_background(struct fuse_req *req)
 {
-	if (req->inode)
-		iput(req->inode);
-	if (req->inode2)
-		iput(req->inode2);
+	iput(req->inode);
+	iput(req->inode2);
 	if (req->file)
 		fput(req->file);
 	spin_lock(&fuse_lock);
@@ -191,7 +195,17 @@ void fuse_release_background(struct fuse_req *req)
 	spin_unlock(&fuse_lock);
 }
 
-/* Called with fuse_lock, unlocks it */
+/*
+ * This function is called when a request is finished.  Either a reply
+ * has arrived, or it was interrupted (and not yet sent), or some error
+ * occurred during communication with userspace, or the device file was
+ * closed.  It decreases the reference count of the request.  In case
+ * of a background request the references to the stored objects are
+ * released.  The requester thread is woken up (if still waiting), and
+ * finally the request is either freed or put on the unused_list.
+ *
+ * Called with fuse_lock, unlocks it
+ */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int putback;
@@ -218,10 +232,37 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_putback_request(fc, req);
 }
 
+/*
+ * Unfortunately request interruption does not just solve the deadlock
+ * problem, it causes problems too.  These stem from the fact that an
+ * interrupted request continues to be processed in userspace, while
+ * all the locks and object references (inode and file) held during
+ * the operation are released.
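+ * (For example, if an unlink() is interrupted after the UNLINK
+ * request has reached userspace, the daemon may still be processing
+ * it while the kernel side has already dropped the inode semaphore
+ * and returned to the caller.)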
+ *
+ * Releasing the locks is exactly why there's a need to interrupt the
+ * request, so there's not a lot that can be done about this, except
+ * to introduce additional locking in userspace.
+ *
+ * More important is to keep inode and file references until userspace
+ * has replied, otherwise FORGET and RELEASE could be sent while the
+ * inode/file is still used by the filesystem.
+ *
+ * For this reason the concept of a "background" request is
+ * introduced.  An interrupted request is backgrounded if it has
+ * already been sent to userspace.  Backgrounding involves getting an
+ * extra reference to the inode(s) or file used in the request, and
+ * adding the request to the fc->background list.  When a reply is
+ * received for a background request, the object references are
+ * released, and the request is removed from the list.  If the
+ * filesystem is unmounted while there are still background requests,
+ * the list is walked and references are released as if a reply had
+ * been received.
+ *
+ * There's one more use for a background request.  The RELEASE message
+ * is always sent as background, since it doesn't return an error or
+ * data.
+ */
 static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	/* Need to get hold of the inode(s) and/or file used in the
-	   request, so FORGET and RELEASE are not sent too early */
 	req->background = 1;
 	list_add(&req->bg_entry, &fc->background);
 	if (req->inode)
@@ -273,7 +314,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req,
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is being
 		   copied to/from the buffers of req.  During
-		   locked state, there musn't be any filesystem
+		   locked state, there mustn't be any filesystem
 		   operation (e.g. page fault), since that could lead
 		   to deadlock */
 		spin_unlock(&fuse_lock);
@@ -308,7 +349,12 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	if (!req->preallocated) {
-		/* decrease outstanding_sem, but without blocking... */
+		/* If the request is not preallocated (either FORGET
+		   or RELEASE), then still decrease outstanding_sem,
+		   so the user can't open an infinite number of files
+		   while not processing the RELEASE requests.  However,
+		   for efficiency do it without blocking: if down()
+		   would block, just increase the debt instead */
 		if (down_trylock(&fc->outstanding_sem))
 			fc->outstanding_debt++;
 	}
@@ -338,6 +384,11 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 	request_send_wait(fc, req, 1);
 }
 
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
+ * be interrupted, but only by SIGKILL.
+ */
 void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
 {
 	request_send_wait(fc, req, 0);
 }
@@ -388,6 +439,11 @@ void fuse_send_init(struct fuse_conn *fc)
 	request_send_background(fc, req);
 }
 
+/*
+ * Lock the request.  Up to the next unlock_request() there mustn't be
+ * anything that could cause a page fault.  If the request was already
+ * interrupted, bail out.
+ */
 static inline int lock_request(struct fuse_req *req)
 {
 	int err = 0;
@@ -402,6 +458,11 @@ static inline int lock_request(struct fuse_req *req)
 	return err;
 }
 
+/*
+ * Unlock the request.  If it was interrupted while it was locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
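+ * (This pairs with lock_request() above; the requester sleeps in
+ * request_wait_answer() while the request is locked.)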
+ */
 static inline void unlock_request(struct fuse_req *req)
 {
 	if (req) {
@@ -413,15 +474,6 @@ static inline void unlock_request(struct fuse_req *req)
 	}
 }
 
-/* Why all this complex one-page-at-a-time copying needed instead of
-   just copy_to/from_user()?  The reason is that blocking on a page
-   fault must be avoided while the request is locked.  This is because
-   if servicing that pagefault happens to be done by this filesystem,
-   an unbreakable deadlock can occur.  So the code is careful to allow
-   request interruption during get_user_pages(), and only lock the
-   request while doing kmapped copying, which cannot block.
- */
-
 struct fuse_copy_state {
 	int write;
 	struct fuse_req *req;
@@ -435,26 +487,18 @@ struct fuse_copy_state {
 	unsigned len;
 };
 
-static unsigned fuse_copy_init(struct fuse_copy_state *cs, int write,
-			       struct fuse_req *req, const struct iovec *iov,
-			       unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+			   struct fuse_req *req, const struct iovec *iov,
+			   unsigned long nr_segs)
 {
-	unsigned i;
-	unsigned nbytes;
-
 	memset(cs, 0, sizeof(*cs));
 	cs->write = write;
 	cs->req = req;
 	cs->iov = iov;
 	cs->nr_segs = nr_segs;
-
-	nbytes = 0;
-	for (i = 0; i < nr_segs; i++)
-		nbytes += iov[i].iov_len;
-
-	return nbytes;
 }
 
+/* Unmap and put the previous page of the userspace buffer */
 static inline void fuse_copy_finish(struct fuse_copy_state *cs)
 {
 	if (cs->mapaddr) {
@@ -468,6 +512,10 @@ static inline void fuse_copy_finish(struct fuse_copy_state *cs)
 	}
 }
 
+/*
+ * Get another pageful of the userspace buffer, map it into kernel
+ * address space, and lock the request
+ */
 static int fuse_copy_fill(struct fuse_copy_state *cs)
 {
 	unsigned long offset;
@@ -499,6 +547,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	return lock_request(cs->req);
 }
 
+/* Do as much copying to/from the userspace buffer as we can */
 static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
 			       unsigned *size)
 {
@@ -516,6 +565,10 @@ static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
 	return ncpy;
 }
 
+/*
+ * Copy a page in the request to/from the userspace buffer.  Must be
+ * done atomically
+ */
 static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
 				 unsigned offset, unsigned count, int zeroing)
 {
@@ -541,6 +594,7 @@ static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
 	return 0;
 }
 
+/* Copy pages in the request to/from the userspace buffer */
 static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 			   int zeroing)
 {
@@ -562,6 +616,7 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 	return 0;
 }
 
+/* Copy a single argument in the request to/from the userspace buffer */
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
 	while (size) {
@@ -573,6 +628,7 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 	return 0;
 }
 
+/* Copy request arguments to/from the userspace buffer */
 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 			  unsigned argpages, struct fuse_arg *args,
 			  int zeroing)
@@ -590,6 +646,7 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 	return err;
 }
 
+/* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -608,6 +665,15 @@ static void request_wait(struct fuse_conn *fc)
 	remove_wait_queue(&fc->waitq, &wait);
 }
 
+/*
+ * Read a single request into the userspace filesystem's buffer.  This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies the request data to the userspace
+ * buffer.  If no reply is needed (FORGET), or the request has been
+ * interrupted, or there was an error during the copying, then it is
+ * finished by calling request_end().  Otherwise add it to the
+ * processing list and set the 'sent' flag.
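+ *
+ * (Illustrative only: a userspace daemon typically drives this with
+ * a simple loop on its FUSE device descriptor, e.g.
+ *
+ *	while ((res = read(fuse_fd, buf, bufsize)) > 0)
+ *		dispatch_request(buf, res);
+ *
+ * where 'fuse_fd' and 'dispatch_request' are hypothetical names for
+ * the opened device and the daemon's request handler.)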
+ */
 static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t *off)
 {
@@ -616,7 +682,6 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	struct fuse_req *req;
 	struct fuse_in *in;
 	struct fuse_copy_state cs;
-	unsigned nbytes;
 	unsigned reqsize;
 
 	spin_lock(&fuse_lock);
@@ -638,9 +703,9 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	in = &req->in;
 	reqsize = req->in.h.len;
-	nbytes = fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
 	err = -EINVAL;
-	if (nbytes >= reqsize) {
+	if (iov_length(iov, nr_segs) >= reqsize) {
 		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
 		if (!err)
 			err = fuse_copy_args(&cs, in->numargs, in->argpages,
@@ -681,6 +746,7 @@ static ssize_t fuse_dev_read(struct file *file, char __user *buf,
 	return fuse_dev_readv(file, &iov, 1, off);
 }
 
+/* Look up a request on the processing list by its unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, unsigned unique)
 {
 	struct list_head *entry;
@@ -717,11 +783,18 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 			    out->page_zeroing);
 }
 
+/*
+ * Write a single reply to a request.  First the header is copied from
+ * the write buffer.  The request is then looked up on the processing
+ * list by the unique ID found in the header.  If found, it is removed
+ * from the list, and the rest of the buffer is copied to the request.
+ * The request is finished by calling request_end()
+ */
 static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t *off)
 {
 	int err;
-	unsigned nbytes;
+	unsigned nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 	struct fuse_copy_state cs;
@@ -729,7 +802,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	if (!fc)
 		return -ENODEV;
 
-	nbytes = fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
 	if (nbytes < sizeof(struct fuse_out_header))
 		return -EINVAL;
@@ -806,6 +879,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+/* Abort all requests on the given list (pending or processing) */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
 {
 	while (!list_empty(head)) {