TOMOYO Linux Cross Reference
Linux/fs/nfs/direct.c

/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001  Initial implementation for 2.4  --cel
 * 08 Jul 2002  Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003  Port to 2.5 APIs  --cel
 * 31 Mar 2004  Handle direct I/O without VFS support  --cel
 * 15 Sep 2004  Parallel async reads  --cel
 * 04 May 2005  support O_DIRECT with aio  --cel
 *
 */
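/*
 * Illustrative example (added commentary, not part of the original
 * file): an application requests uncached I/O by opening the file with
 * O_DIRECT and issuing reads or writes from a suitably aligned buffer.
 * Since the client does not fix up alignment, userspace must do so:
 *
 *      int fd = open("/mnt/nfs/data", O_RDONLY | O_DIRECT);
 *      void *buf;
 *      posix_memalign(&buf, 4096, 4096);
 *      read(fd, buf, 4096);    (bypasses the Linux page cache)
 */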

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"

#define NFSDBG_FACILITY         NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
        struct kref             kref;           /* release manager */

        /* I/O parameters */
        struct nfs_open_context *ctx;           /* file open context info */
        struct nfs_lock_context *l_ctx;         /* Lock context info */
        struct kiocb *          iocb;           /* controlling i/o request */
        struct inode *          inode;          /* target file of i/o */

        /* completion state */
        atomic_t                io_count;       /* i/os we're waiting for */
        spinlock_t              lock;           /* protect completion state */
        ssize_t                 count,          /* bytes actually processed */
                                bytes_left,     /* bytes left to be sent */
                                error;          /* any reported error */
        struct completion       completion;     /* wait for i/o completion */

        /* commit state */
        struct nfs_mds_commit_info mds_cinfo;   /* Storage for cinfo */
        struct pnfs_ds_commit_info ds_cinfo;    /* Storage for cinfo */
        struct work_struct      work;
        int                     flags;
#define NFS_ODIRECT_DO_COMMIT           (1)     /* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES      (2)     /* write verification failed */
        struct nfs_writeverf    verf;           /* unstable write verifier */
};

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
static void nfs_direct_write_schedule_work(struct work_struct *work);

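/*
 * Added commentary: dreq->io_count tracks outstanding RPCs.  The
 * scheduling paths take one reference per dispatched request and the
 * completion paths drop one, so a put_dreq() that returns true means
 * the final I/O has finished and the dreq can be completed.
 */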
static inline void get_dreq(struct nfs_direct_req *dreq)
{
        atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
        return atomic_dec_and_test(&dreq->io_count);
}

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O. However, for most direct I/O, we
 * shunt off direct read and write requests before the VFS gets them,
 * so this method is only ever called for swap.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;

        /* we only support swapfiles calling nfs_direct_IO */
        if (!IS_SWAPFILE(inode))
                return 0;

#ifndef CONFIG_NFS_SWAP
        dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
                        iocb->ki_filp->f_path.dentry->d_name.name,
                        (long long) pos, nr_segs);

        return -EINVAL;
#else
        VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);

        if (rw == READ || rw == KERNEL_READ)
                return nfs_file_direct_read(iocb, iov, nr_segs, pos,
                                rw == READ ? true : false);
        return nfs_file_direct_write(iocb, iov, nr_segs, pos,
                                rw == WRITE ? true : false);
#endif /* CONFIG_NFS_SWAP */
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
        unsigned int i;
        for (i = 0; i < npages; i++)
                page_cache_release(pages[i]);
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq)
{
        cinfo->lock = &dreq->lock;
        cinfo->mds = &dreq->mds_cinfo;
        cinfo->ds = &dreq->ds_cinfo;
        cinfo->dreq = dreq;
        cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
        struct nfs_direct_req *dreq;

        dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
        if (!dreq)
                return NULL;

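        /*
         * Added commentary: two references are taken here.  The I/O
         * completion path drops one in nfs_direct_complete(), and the
         * issuing caller drops the other via nfs_direct_req_release().
         */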
        kref_init(&dreq->kref);
        kref_get(&dreq->kref);
        init_completion(&dreq->completion);
        INIT_LIST_HEAD(&dreq->mds_cinfo.list);
        INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
        spin_lock_init(&dreq->lock);

        return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
        struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

        nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
        if (dreq->l_ctx != NULL)
                nfs_put_lock_context(dreq->l_ctx);
        if (dreq->ctx != NULL)
                put_nfs_open_context(dreq->ctx);
        kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
        kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
{
        return dreq->bytes_left;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
        ssize_t result = -EIOCBQUEUED;

        /* Async requests don't wait here */
        if (dreq->iocb)
                goto out;

        result = wait_for_completion_killable(&dreq->completion);

        if (!result)
                result = dreq->error;
        if (!result)
                result = dreq->count;

out:
        return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
{
        struct inode *inode = dreq->inode;

        if (dreq->iocb && write) {
                loff_t pos = dreq->iocb->ki_pos + dreq->count;

                spin_lock(&inode->i_lock);
                if (i_size_read(inode) < pos)
                        i_size_write(inode, pos);
                spin_unlock(&inode->i_lock);
        }

        if (write)
                nfs_zap_mapping(inode, inode->i_mapping);

        inode_dio_done(inode);

        if (dreq->iocb) {
                long res = (long) dreq->error;
                if (!res)
                        res = (long) dreq->count;
                aio_complete(dreq->iocb, res, 0);
        }

        complete_all(&dreq->completion);

        nfs_direct_req_release(dreq);
}

static void nfs_direct_readpage_release(struct nfs_page *req)
{
        dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
                req->wb_context->dentry->d_inode->i_sb->s_id,
                (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
                req->wb_bytes,
                (long long)req_offset(req));
        nfs_release_request(req);
}

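/*
 * Added commentary: pages that actually received READ data are marked
 * dirty below so their new contents are not lost if the pages are
 * reclaimed before the application looks at them.
 */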
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
        unsigned long bytes = 0;
        struct nfs_direct_req *dreq = hdr->dreq;

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
                goto out_put;

        spin_lock(&dreq->lock);
        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
                dreq->error = hdr->error;
        else
                dreq->count += hdr->good_bytes;
        spin_unlock(&dreq->lock);

        while (!list_empty(&hdr->pages)) {
                struct nfs_page *req = nfs_list_entry(hdr->pages.next);
                struct page *page = req->wb_page;

                if (!PageCompound(page) && bytes < hdr->good_bytes)
                        set_page_dirty(page);
                bytes += req->wb_bytes;
                nfs_list_remove_request(req);
                nfs_direct_readpage_release(req);
        }
out_put:
        if (put_dreq(dreq))
                nfs_direct_complete(dreq, false);
        hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head)
{
        struct nfs_page *req;

        while (!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_release_request(req);
        }
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
        get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
        .error_cleanup = nfs_read_sync_pgio_error,
        .init_hdr = nfs_direct_pgio_init,
        .completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If allocating the page array or get_user_pages() fails,
 * bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_completion().  Otherwise,
 * if no requests have been sent, just return an error.
 */
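/*
 * Worked example (added commentary, assuming PAGE_SIZE == 4096): a
 * read of 10000 bytes from user address 0x1003 with an rsize of 8192
 * gives pgbase = 3 and a first chunk of min(8192, 10000) = 8192 bytes,
 * which nfs_page_array_len(3, 8192) maps onto 3 pages; the remaining
 * 1808 bytes go out in the next loop iteration.
 */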
static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
                                                const struct iovec *iov,
                                                loff_t pos, bool uio)
{
        struct nfs_direct_req *dreq = desc->pg_dreq;
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->dentry->d_inode;
        unsigned long user_addr = (unsigned long)iov->iov_base;
        size_t count = iov->iov_len;
        size_t rsize = NFS_SERVER(inode)->rsize;
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
        struct page **pagevec = NULL;
        unsigned int npages;

        do {
                size_t bytes;
                int i;

                pgbase = user_addr & ~PAGE_MASK;
                bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);

                result = -ENOMEM;
                npages = nfs_page_array_len(pgbase, bytes);
                if (!pagevec)
                        pagevec = kmalloc(npages * sizeof(struct page *),
                                          GFP_KERNEL);
                if (!pagevec)
                        break;
                if (uio) {
                        down_read(&current->mm->mmap_sem);
                        result = get_user_pages(current, current->mm, user_addr,
                                        npages, 1, 0, pagevec, NULL);
                        up_read(&current->mm->mmap_sem);
                        if (result < 0)
                                break;
                } else {
                        WARN_ON(npages != 1);
                        result = get_kernel_page(user_addr, 1, pagevec);
                        if (WARN_ON(result != 1))
                                break;
                }

                if ((unsigned)result < npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(pagevec, result);
                                break;
                        }
                        bytes -= pgbase;
                        npages = result;
                }

                for (i = 0; i < npages; i++) {
                        struct nfs_page *req;
                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
                        /* XXX do we need to do the eof zeroing found in async_filler? */
                        req = nfs_create_request(dreq->ctx, dreq->inode,
                                                 pagevec[i],
                                                 pgbase, req_len);
                        if (IS_ERR(req)) {
                                result = PTR_ERR(req);
                                break;
                        }
                        req->wb_index = pos >> PAGE_SHIFT;
                        req->wb_offset = pos & ~PAGE_MASK;
                        if (!nfs_pageio_add_request(desc, req)) {
                                result = desc->pg_error;
                                nfs_release_request(req);
                                break;
                        }
                        pgbase = 0;
                        bytes -= req_len;
                        started += req_len;
                        user_addr += req_len;
                        pos += req_len;
                        count -= req_len;
                        dreq->bytes_left -= req_len;
                }
                /* The nfs_page structures now hold references to these pages */
                nfs_direct_release_pages(pagevec, npages);
        } while (count != 0 && result >= 0);

        kfree(pagevec);

        if (started)
                return started;
        return result < 0 ? (ssize_t) result : -EFAULT;
}

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
                                              const struct iovec *iov,
                                              unsigned long nr_segs,
                                              loff_t pos, bool uio)
{
        struct nfs_pageio_descriptor desc;
        struct inode *inode = dreq->inode;
        ssize_t result = -EINVAL;
        size_t requested_bytes = 0;
        unsigned long seg;

        NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
                             &nfs_direct_read_completion_ops);
        get_dreq(dreq);
        desc.pg_dreq = dreq;
        atomic_inc(&inode->i_dio_count);

        for (seg = 0; seg < nr_segs; seg++) {
                const struct iovec *vec = &iov[seg];
                result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
                if (result < 0)
                        break;
                requested_bytes += result;
                if ((size_t)result < vec->iov_len)
                        break;
                pos += vec->iov_len;
        }

        nfs_pageio_complete(&desc);

        /*
         * If no bytes were started, return the error, and let the
         * generic layer handle the completion.
         */
        if (requested_bytes == 0) {
                inode_dio_done(inode);
                nfs_direct_req_release(dreq);
                return result < 0 ? result : -EIO;
        }

        if (put_dreq(dreq))
                nfs_direct_complete(dreq, false);
        return 0;
}

static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos, bool uio)
{
        ssize_t result = -ENOMEM;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;

        dreq = nfs_direct_req_alloc();
        if (dreq == NULL)
                goto out;

        dreq->inode = inode;
        dreq->bytes_left = iov_length(iov, nr_segs);
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
                result = PTR_ERR(l_ctx);
                goto out_release;
        }
        dreq->l_ctx = l_ctx;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;

        NFS_I(inode)->read_io += iov_length(iov, nr_segs);
        result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
        if (!result)
                result = nfs_direct_wait(dreq);
out_release:
        nfs_direct_req_release(dreq);
out:
        return result;
}

#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
        struct nfs_pageio_descriptor desc;
        struct nfs_page *req, *tmp;
        LIST_HEAD(reqs);
        struct nfs_commit_info cinfo;
        LIST_HEAD(failed);

        nfs_init_cinfo_from_dreq(&cinfo, dreq);
        pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
        spin_lock(cinfo.lock);
        nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
        spin_unlock(cinfo.lock);

        dreq->count = 0;
        get_dreq(dreq);

        NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;

        list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
                if (!nfs_pageio_add_request(&desc, req)) {
                        nfs_list_remove_request(req);
                        nfs_list_add_request(req, &failed);
                        spin_lock(cinfo.lock);
                        dreq->flags = 0;
                        dreq->error = -EIO;
                        spin_unlock(cinfo.lock);
                }
                nfs_release_request(req);
        }
        nfs_pageio_complete(&desc);

        while (!list_empty(&failed)) {
                req = nfs_list_entry(failed.next);
                nfs_list_remove_request(req);
                nfs_unlock_and_release_request(req);
        }

        if (put_dreq(dreq))
                nfs_direct_write_complete(dreq, dreq->inode);
}

static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
        struct nfs_direct_req *dreq = data->dreq;
        struct nfs_commit_info cinfo;
        struct nfs_page *req;
        int status = data->task.tk_status;

        nfs_init_cinfo_from_dreq(&cinfo, dreq);
        if (status < 0) {
                dprintk("NFS: %5u commit failed with error %d.\n",
                        data->task.tk_pid, status);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
                dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        }

        dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
                if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
                        /* Note the rewrite will go through mds */
                        nfs_mark_request_commit(req, NULL, &cinfo);
                } else
                        nfs_release_request(req);
                nfs_unlock_and_release_request(req);
        }

        if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
                nfs_direct_write_complete(dreq, data->inode);
}

static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
{
        /* There is no lock to clear */
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
        .completion = nfs_direct_commit_complete,
        .error_cleanup = nfs_direct_error_cleanup,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
        int res;
        struct nfs_commit_info cinfo;
        LIST_HEAD(mds_list);

        nfs_init_cinfo_from_dreq(&cinfo, dreq);
        nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
        res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
        if (res < 0) /* res == -ENOMEM */
                nfs_direct_write_reschedule(dreq);
}

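/*
 * Added commentary: dreq->work exists so that the COMMIT/reschedule
 * step runs from a workqueue rather than directly in the RPC
 * completion context that ends up calling nfs_direct_write_complete().
 */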
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
        struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
        int flags = dreq->flags;

        dreq->flags = 0;
        switch (flags) {
                case NFS_ODIRECT_DO_COMMIT:
                        nfs_direct_commit_schedule(dreq);
                        break;
                case NFS_ODIRECT_RESCHED_WRITES:
                        nfs_direct_write_reschedule(dreq);
                        break;
                default:
                        nfs_direct_complete(dreq, true);
        }
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
        schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}

#else
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
        nfs_direct_complete(dreq, true);
}
#endif

/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If allocating the page array or get_user_pages() fails,
 * bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_completion().  Otherwise,
 * if no requests have been sent, just return an error.
 *
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
                                                 const struct iovec *iov,
                                                 loff_t pos, bool uio)
{
        struct nfs_direct_req *dreq = desc->pg_dreq;
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->dentry->d_inode;
        unsigned long user_addr = (unsigned long)iov->iov_base;
        size_t count = iov->iov_len;
        size_t wsize = NFS_SERVER(inode)->wsize;
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
        struct page **pagevec = NULL;
        unsigned int npages;

        do {
                size_t bytes;
                int i;

                pgbase = user_addr & ~PAGE_MASK;
                bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);

                result = -ENOMEM;
                npages = nfs_page_array_len(pgbase, bytes);
                if (!pagevec)
                        pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
                if (!pagevec)
                        break;

                if (uio) {
                        down_read(&current->mm->mmap_sem);
                        result = get_user_pages(current, current->mm, user_addr,
                                                npages, 0, 0, pagevec, NULL);
                        up_read(&current->mm->mmap_sem);
                        if (result < 0)
                                break;
                } else {
                        WARN_ON(npages != 1);
                        result = get_kernel_page(user_addr, 0, pagevec);
                        if (WARN_ON(result != 1))
                                break;
                }

                if ((unsigned)result < npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(pagevec, result);
                                break;
                        }
                        bytes -= pgbase;
                        npages = result;
                }

                for (i = 0; i < npages; i++) {
                        struct nfs_page *req;
                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

                        req = nfs_create_request(dreq->ctx, dreq->inode,
                                                 pagevec[i],
                                                 pgbase, req_len);
                        if (IS_ERR(req)) {
                                result = PTR_ERR(req);
                                break;
                        }
                        nfs_lock_request(req);
                        req->wb_index = pos >> PAGE_SHIFT;
                        req->wb_offset = pos & ~PAGE_MASK;
                        if (!nfs_pageio_add_request(desc, req)) {
                                result = desc->pg_error;
                                nfs_unlock_and_release_request(req);
                                break;
                        }
                        pgbase = 0;
                        bytes -= req_len;
                        started += req_len;
                        user_addr += req_len;
                        pos += req_len;
                        count -= req_len;
                        dreq->bytes_left -= req_len;
                }
                /* The nfs_page structures now hold references to these pages */
                nfs_direct_release_pages(pagevec, npages);
        } while (count != 0 && result >= 0);

        kfree(pagevec);

        if (started)
                return started;
        return result < 0 ? (ssize_t) result : -EFAULT;
}

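/*
 * Added commentary on the verifier handling below: on the first
 * unstable reply the write verifier is saved in dreq->verf and the
 * pages are queued for COMMIT; if a later chunk carries a different
 * verifier the server may have rebooted between replies, so the
 * writes are rescheduled (through the MDS) instead of committed.
 */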
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
        struct nfs_direct_req *dreq = hdr->dreq;
        struct nfs_commit_info cinfo;
        int bit = -1;
        struct nfs_page *req = nfs_list_entry(hdr->pages.next);

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
                goto out_put;

        nfs_init_cinfo_from_dreq(&cinfo, dreq);

        spin_lock(&dreq->lock);

        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
                dreq->flags = 0;
                dreq->error = hdr->error;
        }
        if (dreq->error != 0)
                bit = NFS_IOHDR_ERROR;
        else {
                dreq->count += hdr->good_bytes;
                if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
                        dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
                        bit = NFS_IOHDR_NEED_RESCHED;
                } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
                                bit = NFS_IOHDR_NEED_RESCHED;
                        else if (dreq->flags == 0) {
                                memcpy(&dreq->verf, hdr->verf,
                                       sizeof(dreq->verf));
                                bit = NFS_IOHDR_NEED_COMMIT;
                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
                                if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
                                        dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
                                        bit = NFS_IOHDR_NEED_RESCHED;
                                } else
                                        bit = NFS_IOHDR_NEED_COMMIT;
                        }
                }
        }
        spin_unlock(&dreq->lock);

        while (!list_empty(&hdr->pages)) {
                req = nfs_list_entry(hdr->pages.next);
                nfs_list_remove_request(req);
                switch (bit) {
                case NFS_IOHDR_NEED_RESCHED:
                case NFS_IOHDR_NEED_COMMIT:
                        kref_get(&req->wb_kref);
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                }
                nfs_unlock_and_release_request(req);
        }

out_put:
        if (put_dreq(dreq))
                nfs_direct_write_complete(dreq, hdr->inode);
        hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head)
{
        struct nfs_page *req;

        while (!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_unlock_and_release_request(req);
        }
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
        .error_cleanup = nfs_write_sync_pgio_error,
        .init_hdr = nfs_direct_pgio_init,
        .completion = nfs_direct_write_completion,
};

static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
                                               const struct iovec *iov,
                                               unsigned long nr_segs,
                                               loff_t pos, bool uio)
{
        struct nfs_pageio_descriptor desc;
        struct inode *inode = dreq->inode;
        ssize_t result = 0;
        size_t requested_bytes = 0;
        unsigned long seg;

        NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
        get_dreq(dreq);
        atomic_inc(&inode->i_dio_count);

        NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
        for (seg = 0; seg < nr_segs; seg++) {
                const struct iovec *vec = &iov[seg];
                result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
                if (result < 0)
                        break;
                requested_bytes += result;
                if ((size_t)result < vec->iov_len)
                        break;
                pos += vec->iov_len;
        }
        nfs_pageio_complete(&desc);

        /*
         * If no bytes were started, return the error, and let the
         * generic layer handle the completion.
         */
        if (requested_bytes == 0) {
                inode_dio_done(inode);
                nfs_direct_req_release(dreq);
                return result < 0 ? result : -EIO;
        }

        if (put_dreq(dreq))
                nfs_direct_write_complete(dreq, dreq->inode);
        return 0;
}

static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos,
                                size_t count, bool uio)
{
        ssize_t result = -ENOMEM;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;

        dreq = nfs_direct_req_alloc();
        if (!dreq)
                goto out;

        dreq->inode = inode;
        dreq->bytes_left = count;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
                result = PTR_ERR(l_ctx);
                goto out_release;
        }
        dreq->l_ctx = l_ctx;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;

        result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
        if (!result)
                result = nfs_direct_wait(dreq);
out_release:
        nfs_direct_req_release(dreq);
out:
        return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iov: vector of user buffers into which to read data
 * @nr_segs: size of iov vector
 * @pos: byte offset in file where reading starts
 * @uio: true if the iovec describes user-space buffers
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid its check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos, bool uio)
{
        ssize_t retval = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        size_t count;

        count = iov_length(iov, nr_segs);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

        dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
                count, (long long) pos);

        retval = 0;
        if (!count)
                goto out;

        retval = nfs_sync_mapping(mapping);
        if (retval)
                goto out;

        task_io_account_read(count);

        retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
        if (retval > 0)
                iocb->ki_pos = pos + retval;

out:
        return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iov: vector of user buffers from which to write data
 * @nr_segs: size of iov vector
 * @pos: byte offset in file where writing starts
 * @uio: true if the iovec describes user-space buffers
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos, bool uio)
{
        ssize_t retval = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        size_t count;

        count = iov_length(iov, nr_segs);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

        dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
                count, (long long) pos);

        retval = generic_write_checks(file, &pos, &count, 0);
        if (retval)
                goto out;

        retval = -EINVAL;
        if ((ssize_t) count < 0)
                goto out;
        retval = 0;
        if (!count)
                goto out;

        retval = nfs_sync_mapping(mapping);
        if (retval)
                goto out;

        task_io_account_write(count);

        retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
        if (retval > 0) {
                struct inode *inode = mapping->host;

                iocb->ki_pos = pos + retval;
                spin_lock(&inode->i_lock);
                if (i_size_read(inode) < iocb->ki_pos)
                        i_size_write(inode, iocb->ki_pos);
                spin_unlock(&inode->i_lock);
        }
out:
        return retval;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
        nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
                                                sizeof(struct nfs_direct_req),
                                                0, (SLAB_RECLAIM_ACCOUNT|
                                                        SLAB_MEM_SPREAD),
                                                NULL);
        if (nfs_direct_cachep == NULL)
                return -ENOMEM;

        return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
        kmem_cache_destroy(nfs_direct_cachep);
}
