TOMOYO Linux Cross Reference
Linux/mm/filemap.c


  1 /*
  2  *      linux/mm/filemap.c
  3  *
  4  * Copyright (C) 1994-1999  Linus Torvalds
  5  */
  6 
  7 /*
  8  * This file handles the generic file mmap semantics used by
  9  * most "normal" filesystems (but you don't /have/ to use this:
 10  * the NFS filesystem used to do this differently, for example)
 11  */
 12 #include <linux/config.h>
 13 #include <linux/module.h>
 14 #include <linux/slab.h>
 15 #include <linux/compiler.h>
 16 #include <linux/fs.h>
 17 #include <linux/aio.h>
 18 #include <linux/kernel_stat.h>
 19 #include <linux/mm.h>
 20 #include <linux/swap.h>
 21 #include <linux/mman.h>
 22 #include <linux/pagemap.h>
 23 #include <linux/file.h>
 24 #include <linux/uio.h>
 25 #include <linux/hash.h>
 26 #include <linux/writeback.h>
 27 #include <linux/pagevec.h>
 28 #include <linux/blkdev.h>
 29 #include <linux/security.h>
 30 /*
 31  * This is needed for the following functions:
 32  *  - try_to_release_page
 33  *  - block_invalidatepage
 34  *  - generic_osync_inode
 35  *
 36  * FIXME: remove all knowledge of the buffer layer from the core VM
 37  */
 38 #include <linux/buffer_head.h> /* for generic_osync_inode */
 39 
 40 #include <asm/uaccess.h>
 41 #include <asm/mman.h>
 42 
 43 /*
 44  * Shared mappings implemented 30.11.1994. It's not fully working yet,
 45  * though.
 46  *
 47  * Shared mappings now work. 15.8.1995  Bruno.
 48  *
 49  * finished 'unifying' the page and buffer cache and SMP-threaded the
 50  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 51  *
 52  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 53  */
 54 
 55 /*
 56  * Lock ordering:
 57  *
 58  *  ->i_shared_sem              (vmtruncate)
 59  *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
 60  *      ->swap_list_lock
 61  *        ->swap_device_lock    (exclusive_swap_page, others)
 62  *          ->mapping->page_lock
 63  *
 64  *  ->i_sem
 65  *    ->i_shared_sem            (truncate->invalidate_mmap_range)
 66  *
 67  *  ->mmap_sem
 68  *    ->i_shared_sem            (various places)
 69  *
 70  *  ->mmap_sem
 71  *    ->lock_page               (access_process_vm)
 72  *
 73  *  ->mmap_sem
 74  *    ->i_sem                   (msync)
 75  *
 76  *  ->inode_lock
 77  *    ->sb_lock                 (fs/fs-writeback.c)
 78  *    ->mapping->page_lock      (__sync_single_inode)
 79  *
 80  *  ->page_table_lock
 81  *    ->swap_device_lock        (try_to_unmap_one)
 82  *    ->private_lock            (try_to_unmap_one)
 83  *    ->page_lock               (try_to_unmap_one)
 84  *    ->zone.lru_lock           (follow_page->mark_page_accessed)
 85  *
 86  *  ->task->proc_lock
 87  *    ->dcache_lock             (proc_pid_lookup)
 88  */
 89 
 90 /*
 91  * Remove a page from the page cache and free it. Caller has to make
 92  * sure the page is locked and that nobody else uses it - or that usage
 93  * is safe.  The caller must hold a write_lock on the mapping's page_lock.
 94  */
 95 void __remove_from_page_cache(struct page *page)
 96 {
 97         struct address_space *mapping = page->mapping;
 98 
 99         radix_tree_delete(&mapping->page_tree, page->index);
100         list_del(&page->list);
101         page->mapping = NULL;
102 
103         mapping->nrpages--;
104         pagecache_acct(-1);
105 }
106 
107 void remove_from_page_cache(struct page *page)
108 {
109         struct address_space *mapping = page->mapping;
110 
111         if (unlikely(!PageLocked(page)))
112                 PAGE_BUG(page);
113 
114         spin_lock(&mapping->page_lock);
115         __remove_from_page_cache(page);
116         spin_unlock(&mapping->page_lock);
117 }
118 
119 static inline int sync_page(struct page *page)
120 {
121         struct address_space *mapping = page->mapping;
122 
123         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
124                 return mapping->a_ops->sync_page(page);
125         return 0;
126 }
127 
128 /**
129  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
130  * @mapping: address space structure to write
131  *
132  * This is a "data integrity" operation, as opposed to a regular memory
133  * cleansing writeback.  The difference between these two operations is that
134  * if a dirty page/buffer is encountered, it must be waited upon, and not just
135  * skipped over.
136  */
137 static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
138 {
139         int ret;
140         struct writeback_control wbc = {
141                 .sync_mode = sync_mode,
142                 .nr_to_write = mapping->nrpages * 2,
143         };
144 
145         if (mapping->backing_dev_info->memory_backed)
146                 return 0;
147 
148         spin_lock(&mapping->page_lock);
149         list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
150         spin_unlock(&mapping->page_lock);
151         ret = do_writepages(mapping, &wbc);
152         return ret;
153 }
154 
155 int filemap_fdatawrite(struct address_space *mapping)
156 {
157         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
158 }
159 
160 EXPORT_SYMBOL(filemap_fdatawrite);
161 
162 /*
163  * This is a mostly non-blocking flush.  Not suitable for data-integrity
164  * purposes.
165  */
166 int filemap_flush(struct address_space *mapping)
167 {
168         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
169 }
170 
171 /**
172  * filemap_fdatawait - walk the list of locked pages of the given address
173  *                     space and wait for all of them.
174  * @mapping: address space structure to wait for
175  */
176 int filemap_fdatawait(struct address_space * mapping)
177 {
178         int ret = 0;
179         int progress;
180 
181 restart:
182         progress = 0;
183         spin_lock(&mapping->page_lock);
184         while (!list_empty(&mapping->locked_pages)) {
185                 struct page *page;
186 
187                 page = list_entry(mapping->locked_pages.next,struct page,list);
188                 list_del(&page->list);
189                 if (PageDirty(page))
190                         list_add(&page->list, &mapping->dirty_pages);
191                 else
192                         list_add(&page->list, &mapping->clean_pages);
193 
194                 if (!PageWriteback(page)) {
195                         if (++progress > 32) {
196                                 if (need_resched()) {
197                                         spin_unlock(&mapping->page_lock);
198                                         __cond_resched();
199                                         goto restart;
200                                 }
201                         }
202                         continue;
203                 }
204 
205                 progress = 0;
206                 page_cache_get(page);
207                 spin_unlock(&mapping->page_lock);
208 
209                 wait_on_page_writeback(page);
210                 if (PageError(page))
211                         ret = -EIO;
212 
213                 page_cache_release(page);
214                 spin_lock(&mapping->page_lock);
215         }
216         spin_unlock(&mapping->page_lock);
217 
218         /* Check for outstanding write errors */
219         if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
220                 ret = -ENOSPC;
221         if (test_and_clear_bit(AS_EIO, &mapping->flags))
222                 ret = -EIO;
223 
224         return ret;
225 }
226 
227 EXPORT_SYMBOL(filemap_fdatawait);
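/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * data-integrity path such as an fsync implementation typically pairs the
 * two exports above - start writeback, then wait for it to finish so that
 * any -EIO/-ENOSPC recorded against the mapping is reported.
 */
static int example_write_and_wait(struct address_space *mapping)
{
        int err = filemap_fdatawrite(mapping);  /* queue all dirty pages */
        int err2 = filemap_fdatawait(mapping);  /* wait and collect errors */

        return err ? err : err2;
}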
228 
229 /*
230  * This adds a page to the page cache, starting out as locked, unreferenced,
231  * not uptodate and with no errors.
232  *
233  * This function is used for two things: adding newly allocated pagecache
234  * pages and for moving existing anon pages into swapcache.
235  *
236  * In the case of pagecache pages, the page is new, so we can just run
237  * SetPageLocked() against it.  The other page state flags were set by
238  * rmqueue()
239  *
240  * In the case of swapcache, try_to_swap_out() has already locked the page, so
241  * SetPageLocked() is ugly-but-OK there too.  The required page state has been
242  * set up by swap_out_add_to_swap_cache().
243  *
244  * This function does not add the page to the LRU.  The caller must do that.
245  */
246 int add_to_page_cache(struct page *page, struct address_space *mapping,
247                 pgoff_t offset, int gfp_mask)
248 {
249         int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
250 
251         if (error == 0) {
252                 page_cache_get(page);
253                 spin_lock(&mapping->page_lock);
254                 error = radix_tree_insert(&mapping->page_tree, offset, page);
255                 if (!error) {
256                         SetPageLocked(page);
257                         ___add_to_page_cache(page, mapping, offset);
258                 } else {
259                         page_cache_release(page);
260                 }
261                 spin_unlock(&mapping->page_lock);
262                 radix_tree_preload_end();
263         }
264         return error;
265 }
266 
267 EXPORT_SYMBOL(add_to_page_cache);
268 
269 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
270                                 pgoff_t offset, int gfp_mask)
271 {
272         int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
273         if (ret == 0)
274                 lru_cache_add(page);
275         return ret;
276 }
277 
278 /*
279  * In order to wait for pages to become available there must be
280  * waitqueues associated with pages. By using a hash table of
281  * waitqueues where the bucket discipline is to maintain all
282  * waiters on the same queue and wake all when any of the pages
283  * become available, and for the woken contexts to check to be
284  * sure the appropriate page became available, this saves space
285  * at a cost of "thundering herd" phenomena during rare hash
286  * collisions.
287  */
288 static wait_queue_head_t *page_waitqueue(struct page *page)
289 {
290         const struct zone *zone = page_zone(page);
291 
292         return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
293 }
294 
295 void wait_on_page_bit(struct page *page, int bit_nr)
296 {
297         wait_queue_head_t *waitqueue = page_waitqueue(page);
298         DEFINE_WAIT(wait);
299 
300         do {
301                 prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
302                 if (test_bit(bit_nr, &page->flags)) {
303                         sync_page(page);
304                         io_schedule();
305                 }
306         } while (test_bit(bit_nr, &page->flags));
307         finish_wait(waitqueue, &wait);
308 }
309 
310 EXPORT_SYMBOL(wait_on_page_bit);
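/*
 * Illustrative sketch (paraphrased, not part of this file): the
 * wait_on_page_locked()/wait_on_page_writeback() helpers in
 * <linux/pagemap.h> are thin wrappers that test the bit first and then
 * fall through to wait_on_page_bit() above, roughly:
 */
static inline void example_wait_on_page_locked(struct page *page)
{
        if (PageLocked(page))
                wait_on_page_bit(page, PG_locked);      /* sleep until PG_locked clears */
}

static inline void example_wait_on_page_writeback(struct page *page)
{
        if (PageWriteback(page))
                wait_on_page_bit(page, PG_writeback);   /* sleep until writeback completes */
}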
311 
312 /**
313  * unlock_page() - unlock a locked page
314  *
315  * @page: the page
316  *
317  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
318  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 319  * mechanism between PageLocked pages and PageWriteback pages is shared.
320  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
321  *
322  * The first mb is necessary to safely close the critical section opened by the
323  * TestSetPageLocked(), the second mb is necessary to enforce ordering between
324  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
325  * parallel wait_on_page_locked()).
326  */
327 void unlock_page(struct page *page)
328 {
329         wait_queue_head_t *waitqueue = page_waitqueue(page);
330         smp_mb__before_clear_bit();
331         if (!TestClearPageLocked(page))
332                 BUG();
333         smp_mb__after_clear_bit(); 
334         if (waitqueue_active(waitqueue))
335                 wake_up_all(waitqueue);
336 }
337 
338 EXPORT_SYMBOL(unlock_page);
339 EXPORT_SYMBOL(lock_page);
340 
341 /*
342  * End writeback against a page.
343  */
344 void end_page_writeback(struct page *page)
345 {
346         wait_queue_head_t *waitqueue = page_waitqueue(page);
347 
348         if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
349                 smp_mb__before_clear_bit();
350                 if (!TestClearPageWriteback(page))
351                         BUG();
352                 smp_mb__after_clear_bit();
353         }
354         if (waitqueue_active(waitqueue))
355                 wake_up_all(waitqueue);
356 }
357 
358 EXPORT_SYMBOL(end_page_writeback);
359 
360 /*
361  * Get a lock on the page, assuming we need to sleep to get it.
362  *
363  * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
364  * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
365  * chances are that on the second loop, the block layer's plug list is empty,
366  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
367  */
368 void __lock_page(struct page *page)
369 {
370         wait_queue_head_t *wqh = page_waitqueue(page);
371         DEFINE_WAIT(wait);
372 
373         while (TestSetPageLocked(page)) {
374                 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
375                 if (PageLocked(page)) {
376                         sync_page(page);
377                         io_schedule();
378                 }
379         }
380         finish_wait(wqh, &wait);
381 }
382 
383 EXPORT_SYMBOL(__lock_page);
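/*
 * Illustrative sketch (simplified, not part of this file): lock_page() in
 * <linux/pagemap.h> is the fast-path wrapper around __lock_page() - it
 * only enters the sleeping slow path when the lock bit is already held:
 */
static inline void example_lock_page(struct page *page)
{
        if (TestSetPageLocked(page))    /* someone else owns PG_locked */
                __lock_page(page);      /* slow path: sleep until we get it */
}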
384 
385 /*
386  * a rather lightweight function, finding and getting a reference to a
387  * hashed page atomically.
388  */
389 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
390 {
391         struct page *page;
392 
393         /*
394          * We scan the hash list read-only. Addition to and removal from
395          * the hash-list needs a held write-lock.
396          */
397         spin_lock(&mapping->page_lock);
398         page = radix_tree_lookup(&mapping->page_tree, offset);
399         if (page)
400                 page_cache_get(page);
401         spin_unlock(&mapping->page_lock);
402         return page;
403 }
404 
405 EXPORT_SYMBOL(find_get_page);
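/*
 * Illustrative sketch (hypothetical caller, names invented): callers of
 * find_get_page() own a reference on the returned page and must drop it
 * with page_cache_release() when they are done with it.
 */
static int example_page_is_cached_uptodate(struct address_space *mapping,
                                           unsigned long offset)
{
        struct page *page = find_get_page(mapping, offset);
        int ret = 0;

        if (page) {
                ret = PageUptodate(page);
                page_cache_release(page);       /* drop the reference we were given */
        }
        return ret;
}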
406 
407 /*
408  * Same as above, but trylock it instead of incrementing the count.
409  */
410 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
411 {
412         struct page *page;
413 
414         spin_lock(&mapping->page_lock);
415         page = radix_tree_lookup(&mapping->page_tree, offset);
416         if (page && TestSetPageLocked(page))
417                 page = NULL;
418         spin_unlock(&mapping->page_lock);
419         return page;
420 }
421 
422 EXPORT_SYMBOL(find_trylock_page);
423 
424 /**
425  * find_lock_page - locate, pin and lock a pagecache page
426  *
 427  * @mapping: the address_space to search
 428  * @offset: the page index
429  *
430  * Locates the desired pagecache page, locks it, increments its reference
431  * count and returns its address.
432  *
 433  * Returns NULL if the page was not present. find_lock_page() may sleep.
434  */
435 struct page *find_lock_page(struct address_space *mapping,
436                                 unsigned long offset)
437 {
438         struct page *page;
439 
440         spin_lock(&mapping->page_lock);
441 repeat:
442         page = radix_tree_lookup(&mapping->page_tree, offset);
443         if (page) {
444                 page_cache_get(page);
445                 if (TestSetPageLocked(page)) {
446                         spin_unlock(&mapping->page_lock);
447                         lock_page(page);
448                         spin_lock(&mapping->page_lock);
449 
450                         /* Has the page been truncated while we slept? */
451                         if (page->mapping != mapping || page->index != offset) {
452                                 unlock_page(page);
453                                 page_cache_release(page);
454                                 goto repeat;
455                         }
456                 }
457         }
458         spin_unlock(&mapping->page_lock);
459         return page;
460 }
461 
462 EXPORT_SYMBOL(find_lock_page);
463 
464 /**
465  * find_or_create_page - locate or add a pagecache page
466  *
 467  * @mapping: the page's address_space
 468  * @index: the page's index into the mapping
 469  * @gfp_mask: page allocation mode
470  *
471  * Locates a page in the pagecache.  If the page is not present, a new page
472  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
473  * LRU list.  The returned page is locked and has its reference count
474  * incremented.
475  *
 476  * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 477  * allocation!
 478  *
 479  * find_or_create_page() returns the desired page's address, or NULL on
 480  * memory exhaustion.
481  */
482 struct page *find_or_create_page(struct address_space *mapping,
483                 unsigned long index, unsigned int gfp_mask)
484 {
485         struct page *page, *cached_page = NULL;
486         int err;
487 repeat:
488         page = find_lock_page(mapping, index);
489         if (!page) {
490                 if (!cached_page) {
491                         cached_page = alloc_page(gfp_mask);
492                         if (!cached_page)
493                                 return NULL;
494                 }
495                 err = add_to_page_cache_lru(cached_page, mapping,
496                                         index, gfp_mask);
497                 if (!err) {
498                         page = cached_page;
499                         cached_page = NULL;
500                 } else if (err == -EEXIST)
501                         goto repeat;
502         }
503         if (cached_page)
504                 page_cache_release(cached_page);
505         return page;
506 }
507 
508 EXPORT_SYMBOL(find_or_create_page);
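/*
 * Illustrative sketch (hypothetical caller, names invented): the page
 * comes back locked with an elevated refcount, so the usual pattern is to
 * fill or examine it, then unlock and release it.  grab_cache_page() in
 * <linux/pagemap.h> is essentially this call with mapping_gfp_mask().
 */
static int example_with_locked_page(struct address_space *mapping,
                                    unsigned long index)
{
        struct page *page;

        page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
        if (!page)
                return -ENOMEM;
        /* ... read or modify the page contents here ... */
        unlock_page(page);
        page_cache_release(page);
        return 0;
}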
509 
510 /**
511  * find_get_pages - gang pagecache lookup
512  * @mapping:    The address_space to search
513  * @start:      The starting page index
514  * @nr_pages:   The maximum number of pages
515  * @pages:      Where the resulting pages are placed
516  *
517  * find_get_pages() will search for and return a group of up to
518  * @nr_pages pages in the mapping.  The pages are placed at @pages.
519  * find_get_pages() takes a reference against the returned pages.
520  *
521  * The search returns a group of mapping-contiguous pages with ascending
522  * indexes.  There may be holes in the indices due to not-present pages.
523  *
524  * find_get_pages() returns the number of pages which were found.
525  */
526 unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
527                             unsigned int nr_pages, struct page **pages)
528 {
529         unsigned int i;
530         unsigned int ret;
531 
532         spin_lock(&mapping->page_lock);
533         ret = radix_tree_gang_lookup(&mapping->page_tree,
534                                 (void **)pages, start, nr_pages);
535         for (i = 0; i < ret; i++)
536                 page_cache_get(pages[i]);
537         spin_unlock(&mapping->page_lock);
538         return ret;
539 }
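/*
 * Illustrative sketch (hypothetical caller, names invented): gang lookup
 * is normally done in batches, resuming after the last index returned and
 * releasing the reference taken on every page handed back.
 */
static void example_walk_mapping(struct address_space *mapping)
{
        struct page *pages[16];
        pgoff_t next = 0;
        unsigned int i, nr;

        while ((nr = find_get_pages(mapping, next, 16, pages)) != 0) {
                for (i = 0; i < nr; i++) {
                        next = pages[i]->index + 1;     /* resume after this page */
                        /* ... inspect pages[i] here ... */
                        page_cache_release(pages[i]);
                }
        }
}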
540 
541 /*
542  * Same as grab_cache_page, but do not wait if the page is unavailable.
543  * This is intended for speculative data generators, where the data can
544  * be regenerated if the page couldn't be grabbed.  This routine should
545  * be safe to call while holding the lock for another page.
546  *
547  * Clear __GFP_FS when allocating the page to avoid recursion into the fs
548  * and deadlock against the caller's locked page.
549  */
550 struct page *
551 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
552 {
553         struct page *page = find_get_page(mapping, index);
554         int gfp_mask;
555 
556         if (page) {
557                 if (!TestSetPageLocked(page))
558                         return page;
559                 page_cache_release(page);
560                 return NULL;
561         }
562         gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
563         page = alloc_pages(gfp_mask, 0);
564         if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
565                 page_cache_release(page);
566                 page = NULL;
567         }
568         return page;
569 }
570 
571 EXPORT_SYMBOL(grab_cache_page_nowait);
572 
573 /*
574  * This is a generic file read routine, and uses the
575  * inode->i_op->readpage() function for the actual low-level
576  * stuff.
577  *
578  * This is really ugly. But the goto's actually try to clarify some
579  * of the logic when it comes to error handling etc.
580  * - note the struct file * is only passed for the use of readpage
581  */
582 void do_generic_mapping_read(struct address_space *mapping,
583                              struct file_ra_state *ra,
584                              struct file * filp,
585                              loff_t *ppos,
586                              read_descriptor_t * desc,
587                              read_actor_t actor)
588 {
589         struct inode *inode = mapping->host;
590         unsigned long index, offset;
591         struct page *cached_page;
592         int error;
593 
594         cached_page = NULL;
595         index = *ppos >> PAGE_CACHE_SHIFT;
596         offset = *ppos & ~PAGE_CACHE_MASK;
597 
598         for (;;) {
599                 struct page *page;
600                 unsigned long end_index, nr, ret;
601                 loff_t isize = i_size_read(inode);
602 
603                 end_index = isize >> PAGE_CACHE_SHIFT;
604                         
605                 if (index > end_index)
606                         break;
607                 nr = PAGE_CACHE_SIZE;
608                 if (index == end_index) {
609                         nr = isize & ~PAGE_CACHE_MASK;
610                         if (nr <= offset)
611                                 break;
612                 }
613 
614                 cond_resched();
615                 page_cache_readahead(mapping, ra, filp, index);
616 
617                 nr = nr - offset;
618 find_page:
619                 page = find_get_page(mapping, index);
620                 if (unlikely(page == NULL)) {
621                         handle_ra_miss(mapping, ra, index);
622                         goto no_cached_page;
623                 }
624                 if (!PageUptodate(page))
625                         goto page_not_up_to_date;
626 page_ok:
627                 /* If users can be writing to this page using arbitrary
628                  * virtual addresses, take care about potential aliasing
629                  * before reading the page on the kernel side.
630                  */
631                 if (!list_empty(&mapping->i_mmap_shared))
632                         flush_dcache_page(page);
633 
634                 /*
635                  * Mark the page accessed if we read the beginning.
636                  */
637                 if (!offset)
638                         mark_page_accessed(page);
639 
640                 /*
641                  * Ok, we have the page, and it's up-to-date, so
642                  * now we can copy it to user space...
643                  *
644                  * The actor routine returns how many bytes were actually used..
645                  * NOTE! This may not be the same as how much of a user buffer
646                  * we filled up (we may be padding etc), so we can only update
647                  * "pos" here (the actor routine has to update the user buffer
648                  * pointers and the remaining count).
649                  */
650                 ret = actor(desc, page, offset, nr);
651                 offset += ret;
652                 index += offset >> PAGE_CACHE_SHIFT;
653                 offset &= ~PAGE_CACHE_MASK;
654 
655                 page_cache_release(page);
656                 if (ret == nr && desc->count)
657                         continue;
658                 break;
659 
660 page_not_up_to_date:
661                 if (PageUptodate(page))
662                         goto page_ok;
663 
664                 /* Get exclusive access to the page ... */
665                 lock_page(page);
666 
667                 /* Did it get unhashed before we got the lock? */
668                 if (!page->mapping) {
669                         unlock_page(page);
670                         page_cache_release(page);
671                         continue;
672                 }
673 
674                 /* Did somebody else fill it already? */
675                 if (PageUptodate(page)) {
676                         unlock_page(page);
677                         goto page_ok;
678                 }
679 
680 readpage:
681                 /* ... and start the actual read. The read will unlock the page. */
682                 error = mapping->a_ops->readpage(filp, page);
683 
684                 if (!error) {
685                         if (PageUptodate(page))
686                                 goto page_ok;
687                         wait_on_page_locked(page);
688                         if (PageUptodate(page))
689                                 goto page_ok;
690                         error = -EIO;
691                 }
692 
693                 /* UHHUH! A synchronous read error occurred. Report it */
694                 desc->error = error;
695                 page_cache_release(page);
696                 break;
697 
698 no_cached_page:
699                 /*
700                  * Ok, it wasn't cached, so we need to create a new
701                  * page..
702                  */
703                 if (!cached_page) {
704                         cached_page = page_cache_alloc_cold(mapping);
705                         if (!cached_page) {
706                                 desc->error = -ENOMEM;
707                                 break;
708                         }
709                 }
710                 error = add_to_page_cache_lru(cached_page, mapping,
711                                                 index, GFP_KERNEL);
712                 if (error) {
713                         if (error == -EEXIST)
714                                 goto find_page;
715                         desc->error = error;
716                         break;
717                 }
718                 page = cached_page;
719                 cached_page = NULL;
720                 goto readpage;
721         }
722 
723         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
724         if (cached_page)
725                 page_cache_release(cached_page);
726         update_atime(inode);
727 }
728 
729 EXPORT_SYMBOL(do_generic_mapping_read);
730 
731 int file_read_actor(read_descriptor_t *desc, struct page *page,
732                         unsigned long offset, unsigned long size)
733 {
734         char *kaddr;
735         unsigned long left, count = desc->count;
736 
737         if (size > count)
738                 size = count;
739 
740         /*
741          * Faults on the destination of a read are common, so do it before
742          * taking the kmap.
743          */
744         if (!fault_in_pages_writeable(desc->buf, size)) {
745                 kaddr = kmap_atomic(page, KM_USER0);
746                 left = __copy_to_user(desc->buf, kaddr + offset, size);
747                 kunmap_atomic(kaddr, KM_USER0);
748                 if (left == 0)
749                         goto success;
750         }
751 
752         /* Do it the slow way */
753         kaddr = kmap(page);
754         left = __copy_to_user(desc->buf, kaddr + offset, size);
755         kunmap(page);
756 
757         if (left) {
758                 size -= left;
759                 desc->error = -EFAULT;
760         }
761 success:
762         desc->count = count - size;
763         desc->written += size;
764         desc->buf += size;
765         return size;
766 }
767 
768 /*
769  * This is the "read()" routine for all filesystems
770  * that can use the page cache directly.
771  */
772 ssize_t
773 __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
774                 unsigned long nr_segs, loff_t *ppos)
775 {
776         struct file *filp = iocb->ki_filp;
777         ssize_t retval;
778         unsigned long seg;
779         size_t count;
780 
781         count = 0;
782         for (seg = 0; seg < nr_segs; seg++) {
783                 const struct iovec *iv = &iov[seg];
784 
785                 /*
786                  * If any segment has a negative length, or the cumulative
787                  * length ever wraps negative then return -EINVAL.
788                  */
789                 count += iv->iov_len;
790                 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
791                         return -EINVAL;
792                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
793                         continue;
794                 if (seg == 0)
795                         return -EFAULT;
796                 nr_segs = seg;
797                 count -= iv->iov_len;   /* This segment is no good */
798                 break;
799         }
800 
801         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
802         if (filp->f_flags & O_DIRECT) {
803                 loff_t pos = *ppos, size;
804                 struct address_space *mapping;
805                 struct inode *inode;
806 
807                 mapping = filp->f_dentry->d_inode->i_mapping;
808                 inode = mapping->host;
809                 retval = 0;
810                 if (!count)
811                         goto out; /* skip atime */
812                 size = i_size_read(inode);
813                 if (pos < size) {
814                         retval = generic_file_direct_IO(READ, iocb,
815                                                 iov, pos, nr_segs);
816                         if (retval >= 0 && !is_sync_kiocb(iocb))
817                                 retval = -EIOCBQUEUED;
818                         if (retval > 0)
819                                 *ppos = pos + retval;
820                 }
821                 update_atime(filp->f_dentry->d_inode);
822                 goto out;
823         }
824 
825         retval = 0;
826         if (count) {
827                 for (seg = 0; seg < nr_segs; seg++) {
828                         read_descriptor_t desc;
829 
830                         desc.written = 0;
831                         desc.buf = iov[seg].iov_base;
832                         desc.count = iov[seg].iov_len;
833                         if (desc.count == 0)
834                                 continue;
835                         desc.error = 0;
836                         do_generic_file_read(filp,ppos,&desc,file_read_actor);
837                         retval += desc.written;
838                         if (!retval) {
839                                 retval = desc.error;
840                                 break;
841                         }
842                 }
843         }
844 out:
845         return retval;
846 }
847 
848 EXPORT_SYMBOL(__generic_file_aio_read);
849 
850 ssize_t
851 generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
852 {
853         struct iovec local_iov = { .iov_base = buf, .iov_len = count };
854 
855         BUG_ON(iocb->ki_pos != pos);
856         return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
857 }
858 
859 EXPORT_SYMBOL(generic_file_aio_read);
860 
861 ssize_t
862 generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
863 {
864         struct iovec local_iov = { .iov_base = buf, .iov_len = count };
865         struct kiocb kiocb;
866         ssize_t ret;
867 
868         init_sync_kiocb(&kiocb, filp);
869         ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
870         if (-EIOCBQUEUED == ret)
871                 ret = wait_on_sync_kiocb(&kiocb);
872         return ret;
873 }
874 
875 EXPORT_SYMBOL(generic_file_read);
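/*
 * Illustrative sketch (trimmed and approximate, not part of this file):
 * "normal" disk filesystems point their file_operations straight at these
 * generic routines, roughly as ext2 does:
 */
static struct file_operations example_file_operations = {
        .read           = generic_file_read,
        .aio_read       = generic_file_aio_read,
        .mmap           = generic_file_mmap,
        .sendfile       = generic_file_sendfile,
};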
876 
877 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
878 {
879         ssize_t written;
880         unsigned long count = desc->count;
881         struct file *file = (struct file *) desc->buf;
882 
883         if (size > count)
884                 size = count;
885 
886         written = file->f_op->sendpage(file, page, offset,
887                                        size, &file->f_pos, size<count);
888         if (written < 0) {
889                 desc->error = written;
890                 written = 0;
891         }
892         desc->count = count - written;
893         desc->written += written;
894         return written;
895 }
896 
897 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
898                          size_t count, read_actor_t actor, void __user *target)
899 {
900         read_descriptor_t desc;
901 
902         if (!count)
903                 return 0;
904 
905         desc.written = 0;
906         desc.count = count;
907         desc.buf = target;
908         desc.error = 0;
909 
910         do_generic_file_read(in_file, ppos, &desc, actor);
911         if (desc.written)
912                 return desc.written;
913         return desc.error;
914 }
915 
916 EXPORT_SYMBOL(generic_file_sendfile);
917 
918 static ssize_t
919 do_readahead(struct address_space *mapping, struct file *filp,
920              unsigned long index, unsigned long nr)
921 {
922         if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
923                 return -EINVAL;
924 
925         force_page_cache_readahead(mapping, filp, index,
926                                         max_sane_readahead(nr));
927         return 0;
928 }
929 
930 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
931 {
932         ssize_t ret;
933         struct file *file;
934 
935         ret = -EBADF;
936         file = fget(fd);
937         if (file) {
938                 if (file->f_mode & FMODE_READ) {
939                         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
940                         unsigned long start = offset >> PAGE_CACHE_SHIFT;
941                         unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
942                         unsigned long len = end - start + 1;
943                         ret = do_readahead(mapping, file, start, len);
944                 }
945                 fput(file);
946         }
947         return ret;
948 }
949 
950 #ifdef CONFIG_MMU
951 /*
952  * This adds the requested page to the page cache if it isn't already there,
953  * and schedules an I/O to read in its contents from disk.
954  */
955 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
956 static int page_cache_read(struct file * file, unsigned long offset)
957 {
958         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
959         struct page *page; 
960         int error;
961 
962         page = page_cache_alloc_cold(mapping);
963         if (!page)
964                 return -ENOMEM;
965 
966         error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
967         if (!error) {
968                 error = mapping->a_ops->readpage(file, page);
969                 page_cache_release(page);
970                 return error;
971         }
972 
973         /*
974          * We arrive here in the unlikely event that someone 
975          * raced with us and added our page to the cache first
976          * or we are out of memory for radix-tree nodes.
977          */
978         page_cache_release(page);
979         return error == -EEXIST ? 0 : error;
980 }
981 
982 #define MMAP_READAROUND (16UL)
983 #define MMAP_LOTSAMISS  (100)
984 
985 /*
986  * filemap_nopage() is invoked via the vma operations vector for a
987  * mapped memory region to read in file data during a page fault.
988  *
989  * The goto's are kind of ugly, but this streamlines the normal case of having
990  * it in the page cache, and handles the special cases reasonably without
991  * having a lot of duplicated code.
992  */
993 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
994 {
995         int error;
996         struct file *file = area->vm_file;
997         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
998         struct file_ra_state *ra = &file->f_ra;
999         struct inode *inode = mapping->host;
1000         struct page *page;
1001         unsigned long size, pgoff, endoff;
1002         int did_readaround = 0;
1003 
1004         pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1005         endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1006 
1007 retry_all:
1008         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1009         if (pgoff >= size)
1010                 goto outside_data_content;
1011 
1012         /* If we don't want any read-ahead, don't bother */
1013         if (VM_RandomReadHint(area))
1014                 goto no_cached_page;
1015 
1016         /*
1017          * The "size" of the file, as far as mmap is concerned, isn't bigger
1018          * than the mapping
1019          */
1020         if (size > endoff)
1021                 size = endoff;
1022 
1023         /*
1024          * The readahead code wants to be told about each and every page
1025          * so it can build and shrink its windows appropriately
1026          *
1027          * For sequential accesses, we use the generic readahead logic.
1028          */
1029         if (VM_SequentialReadHint(area))
1030                 page_cache_readahead(mapping, ra, file, pgoff);
1031 
1032         /*
1033          * Do we have something in the page cache already?
1034          */
1035 retry_find:
1036         page = find_get_page(mapping, pgoff);
1037         if (!page) {
1038                 if (VM_SequentialReadHint(area)) {
1039                         handle_ra_miss(mapping, ra, pgoff);
1040                         goto no_cached_page;
1041                 }
1042                 ra->mmap_miss++;
1043 
1044                 /*
1045                  * Do we miss much more than hit in this file? If so,
1046                  * stop bothering with read-ahead. It will only hurt.
1047                  */
1048                 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1049                         goto no_cached_page;
1050 
1051                 did_readaround = 1;
1052                 do_page_cache_readahead(mapping, file,
1053                                 pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
1054                 goto retry_find;
1055         }
1056 
1057         if (!did_readaround)
1058                 ra->mmap_hit++;
1059 
1060         /*
1061          * Ok, found a page in the page cache, now we need to check
1062          * that it's up-to-date.
1063          */
1064         if (!PageUptodate(page))
1065                 goto page_not_uptodate;
1066 
1067 success:
1068         /*
1069          * Found the page and have a reference on it.
1070          */
1071         mark_page_accessed(page);
1072         return page;
1073 
1074 outside_data_content:
1075         /*
1076          * An external ptracer can access pages that normally aren't
1077          * accessible..
1078          */
1079         if (area->vm_mm == current->mm)
1080                 return NULL;
1081         /* Fall through to the non-read-ahead case */
1082 no_cached_page:
1083         /*
1084          * We're only likely to ever get here if MADV_RANDOM is in
1085          * effect.
1086          */
1087         error = page_cache_read(file, pgoff);
1088 
1089         /*
1090          * The page we want has now been added to the page cache.
1091          * In the unlikely event that someone removed it in the
1092          * meantime, we'll just come back here and read it again.
1093          */
1094         if (error >= 0)
1095                 goto retry_find;
1096 
1097         /*
1098          * An error return from page_cache_read can result if the
1099          * system is low on memory, or a problem occurs while trying
1100          * to schedule I/O.
1101          */
1102         if (error == -ENOMEM)
1103                 return NOPAGE_OOM;
1104         return NULL;
1105 
1106 page_not_uptodate:
1107         inc_page_state(pgmajfault);
1108         lock_page(page);
1109 
1110         /* Did it get unhashed while we waited for it? */
1111         if (!page->mapping) {
1112                 unlock_page(page);
1113                 page_cache_release(page);
1114                 goto retry_all;
1115         }
1116 
1117         /* Did somebody else get it up-to-date? */
1118         if (PageUptodate(page)) {
1119                 unlock_page(page);
1120                 goto success;
1121         }
1122 
1123         if (!mapping->a_ops->readpage(file, page)) {
1124                 wait_on_page_locked(page);
1125                 if (PageUptodate(page))
1126                         goto success;
1127         }
1128 
1129         /*
1130          * Umm, take care of errors if the page isn't up-to-date.
1131          * Try to re-read it _once_. We do this synchronously,
1132          * because there really aren't any performance issues here
1133          * and we need to check for errors.
1134          */
1135         lock_page(page);
1136 
1137         /* Somebody truncated the page on us? */
1138         if (!page->mapping) {
1139                 unlock_page(page);
1140                 page_cache_release(page);
1141                 goto retry_all;
1142         }
1143 
1144         /* Somebody else successfully read it in? */
1145         if (PageUptodate(page)) {
1146                 unlock_page(page);
1147                 goto success;
1148         }
1149         ClearPageError(page);
1150         if (!mapping->a_ops->readpage(file, page)) {
1151                 wait_on_page_locked(page);
1152                 if (PageUptodate(page))
1153                         goto success;
1154         }
1155 
1156          * Things didn't work out. Return NULL to tell the
1157          * Things didn't work out. Return zero to tell the
1158          * mm layer so, possibly freeing the page cache page first.
1159          */
1160         page_cache_release(page);
1161         return NULL;
1162 }
1163 
1164 EXPORT_SYMBOL(filemap_nopage);
1165 
1166 static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1167                                         int nonblock)
1168 {
1169         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1170         struct page *page;
1171         int error;
1172 
1173         /*
1174          * Do we have something in the page cache already?
1175          */
1176 retry_find:
1177         page = find_get_page(mapping, pgoff);
1178         if (!page) {
1179                 if (nonblock)
1180                         return NULL;
1181                 goto no_cached_page;
1182         }
1183 
1184         /*
1185          * Ok, found a page in the page cache, now we need to check
1186          * that it's up-to-date.
1187          */
1188         if (!PageUptodate(page))
1189                 goto page_not_uptodate;
1190 
1191 success:
1192         /*
1193          * Found the page and have a reference on it.
1194          */
1195         mark_page_accessed(page);
1196         return page;
1197 
1198 no_cached_page:
1199         error = page_cache_read(file, pgoff);
1200 
1201         /*
1202          * The page we want has now been added to the page cache.
1203          * In the unlikely event that someone removed it in the
1204          * meantime, we'll just come back here and read it again.
1205          */
1206         if (error >= 0)
1207                 goto retry_find;
1208 
1209         /*
1210          * An error return from page_cache_read can result if the
1211          * system is low on memory, or a problem occurs while trying
1212          * to schedule I/O.
1213          */
1214         return NULL;
1215 
1216 page_not_uptodate:
1217         lock_page(page);
1218 
1219         /* Did it get unhashed while we waited for it? */
1220         if (!page->mapping) {
1221                 unlock_page(page);
1222                 goto err;
1223         }
1224 
1225         /* Did somebody else get it up-to-date? */
1226         if (PageUptodate(page)) {
1227                 unlock_page(page);
1228                 goto success;
1229         }
1230 
1231         if (!mapping->a_ops->readpage(file, page)) {
1232                 wait_on_page_locked(page);
1233                 if (PageUptodate(page))
1234                         goto success;
1235         }
1236 
1237         /*
1238          * Umm, take care of errors if the page isn't up-to-date.
1239          * Try to re-read it _once_. We do this synchronously,
1240          * because there really aren't any performance issues here
1241          * and we need to check for errors.
1242          */
1243         lock_page(page);
1244 
1245         /* Somebody truncated the page on us? */
1246         if (!page->mapping) {
1247                 unlock_page(page);
1248                 goto err;
1249         }
1250         /* Somebody else successfully read it in? */
1251         if (PageUptodate(page)) {
1252                 unlock_page(page);
1253                 goto success;
1254         }
1255 
1256         ClearPageError(page);
1257         if (!mapping->a_ops->readpage(file, page)) {
1258                 wait_on_page_locked(page);
1259                 if (PageUptodate(page))
1260                         goto success;
1261         }
1262 
1263         /*
1264          * Things didn't work out. Return NULL to tell the
1265          * mm layer so, possibly freeing the page cache page first.
1266          */
1267 err:
1268         page_cache_release(page);
1269 
1270         return NULL;
1271 }
1272 
1273 static int filemap_populate(struct vm_area_struct *vma,
1274                         unsigned long addr,
1275                         unsigned long len,
1276                         pgprot_t prot,
1277                         unsigned long pgoff,
1278                         int nonblock)
1279 {
1280         struct file *file = vma->vm_file;
1281         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1282         struct inode *inode = mapping->host;
1283         unsigned long size;
1284         struct mm_struct *mm = vma->vm_mm;
1285         struct page *page;
1286         int err;
1287 
1288         if (!nonblock)
1289                 force_page_cache_readahead(mapping, vma->vm_file,
1290                                         pgoff, len >> PAGE_CACHE_SHIFT);
1291 
1292 repeat:
1293         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1294         if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1295                 return -EINVAL;
1296 
1297         page = filemap_getpage(file, pgoff, nonblock);
1298         if (!page && !nonblock)
1299                 return -ENOMEM;
1300         if (page) {
1301                 err = install_page(mm, vma, addr, page, prot);
1302                 if (err) {
1303                         page_cache_release(page);
1304                         return err;
1305                 }
1306         } else {
1307                 /*
1308                  * If a nonlinear mapping then store the file page offset
1309                  * in the pte.
1310                  */
1311                 unsigned long pgidx;
1312                 pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
1313                 pgidx += vma->vm_pgoff;
1314                 pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1315                 if (pgoff != pgidx) {
1316                         err = install_file_pte(mm, vma, addr, pgoff, prot);
1317                         if (err)
1318                                 return err;
1319                 }
1320         }
1321 
1322         len -= PAGE_SIZE;
1323         addr += PAGE_SIZE;
1324         pgoff++;
1325         if (len)
1326                 goto repeat;
1327 
1328         return 0;
1329 }
1330 
1331 static struct vm_operations_struct generic_file_vm_ops = {
1332         .nopage         = filemap_nopage,
1333         .populate       = filemap_populate,
1334 };
1335 
1336 /* This is used for a general mmap of a disk file */
1337 
1338 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1339 {
1340         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1341         struct inode *inode = mapping->host;
1342 
1343         if (!mapping->a_ops->readpage)
1344                 return -ENOEXEC;
1345         update_atime(inode);
1346         vma->vm_ops = &generic_file_vm_ops;
1347         return 0;
1348 }
1349 
1350 /*
1351  * This is for filesystems which do not implement ->writepage.
1352  */
1353 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1354 {
1355         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1356                 return -EINVAL;
1357         return generic_file_mmap(file, vma);
1358 }
1359 #else
1360 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1361 {
1362         return -ENOSYS;
1363 }
1364 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1365 {
1366         return -ENOSYS;
1367 }
1368 #endif /* CONFIG_MMU */
1369 
1370 EXPORT_SYMBOL(generic_file_mmap);
1371 EXPORT_SYMBOL(generic_file_readonly_mmap);
1372 
1373 static inline struct page *__read_cache_page(struct address_space *mapping,
1374                                 unsigned long index,
1375                                 int (*filler)(void *,struct page*),
1376                                 void *data)
1377 {
1378         struct page *page, *cached_page = NULL;
1379         int err;
1380 repeat:
1381         page = find_get_page(mapping, index);
1382         if (!page) {
1383                 if (!cached_page) {
1384                         cached_page = page_cache_alloc_cold(mapping);
1385                         if (!cached_page)
1386                                 return ERR_PTR(-ENOMEM);
1387                 }
1388                 err = add_to_page_cache_lru(cached_page, mapping,
1389                                         index, GFP_KERNEL);
1390                 if (err == -EEXIST)
1391                         goto repeat;
1392                 if (err < 0) {
1393                         /* Presumably ENOMEM for radix tree node */
1394                         page_cache_release(cached_page);
1395                         return ERR_PTR(err);
1396                 }
1397                 page = cached_page;
1398                 cached_page = NULL;
1399                 err = filler(data, page);
1400                 if (err < 0) {
1401                         page_cache_release(page);
1402                         page = ERR_PTR(err);
1403                 }
1404         }
1405         if (cached_page)
1406                 page_cache_release(cached_page);
1407         return page;
1408 }
1409 
1410 /*
1411  * Read into the page cache. If a page already exists,
1412  * and PageUptodate() is not set, try to fill the page.
1413  */
1414 struct page *read_cache_page(struct address_space *mapping,
1415                                 unsigned long index,
1416                                 int (*filler)(void *,struct page*),
1417                                 void *data)
1418 {
1419         struct page *page;
1420         int err;
1421 
1422 retry:
1423         page = __read_cache_page(mapping, index, filler, data);
1424         if (IS_ERR(page))
1425                 goto out;
1426         mark_page_accessed(page);
1427         if (PageUptodate(page))
1428                 goto out;
1429 
1430         lock_page(page);
1431         if (!page->mapping) {
1432                 unlock_page(page);
1433                 page_cache_release(page);
1434                 goto retry;
1435         }
1436         if (PageUptodate(page)) {
1437                 unlock_page(page);
1438                 goto out;
1439         }
1440         err = filler(data, page);
1441         if (err < 0) {
1442                 page_cache_release(page);
1443                 page = ERR_PTR(err);
1444         }
1445  out:
1446         return page;
1447 }
1448 
1449 EXPORT_SYMBOL(read_cache_page);
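/*
 * Illustrative sketch (hypothetical caller, names invented): a common
 * pattern is to pass the mapping's own ->readpage method as the filler
 * and then check for read errors before using the page.
 */
static struct page *example_read_mapping_page(struct address_space *mapping,
                                              unsigned long index,
                                              struct file *file)
{
        struct page *page;

        page = read_cache_page(mapping, index,
                        (int (*)(void *, struct page *))mapping->a_ops->readpage,
                        file);
        if (!IS_ERR(page) && PageError(page)) {
                page_cache_release(page);
                page = ERR_PTR(-EIO);
        }
        return page;
}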
1450 
1451 /*
1452  * If the page was newly created, increment its refcount and add it to the
1453  * caller's lru-buffering pagevec.  This function is specifically for
1454  * generic_file_write().
1455  */
1456 static inline struct page *
1457 __grab_cache_page(struct address_space *mapping, unsigned long index,
1458                         struct page **cached_page, struct pagevec *lru_pvec)
1459 {
1460         int err;
1461         struct page *page;
1462 repeat:
1463         page = find_lock_page(mapping, index);
1464         if (!page) {
1465                 if (!*cached_page) {
1466                         *cached_page = page_cache_alloc(mapping);
1467                         if (!*cached_page)
1468                                 return NULL;
1469                 }
1470                 err = add_to_page_cache(*cached_page, mapping,
1471                                         index, GFP_KERNEL);
1472                 if (err == -EEXIST)
1473                         goto repeat;
1474                 if (err == 0) {
1475                         page = *cached_page;
1476                         page_cache_get(page);
1477                         if (!pagevec_add(lru_pvec, page))
1478                                 __pagevec_lru_add(lru_pvec);
1479                         *cached_page = NULL;
1480                 }
1481         }
1482         return page;
1483 }
1484 
1485 void remove_suid(struct dentry *dentry)
1486 {
1487         struct iattr newattrs;
1488         struct inode *inode = dentry->d_inode;
1489         unsigned int mode = inode->i_mode & (S_ISUID|S_ISGID|S_IXGRP);
1490 
1491         if (!(mode & S_IXGRP))
1492                 mode &= S_ISUID;
1493 
1494         /* were any of the uid bits set? */
1495         if (mode && !capable(CAP_FSETID)) {
1496                 newattrs.ia_valid = ATTR_KILL_SUID|ATTR_KILL_SGID|ATTR_FORCE;
1497                 notify_change(dentry, &newattrs);
1498         }
1499 }
1500 
1501 EXPORT_SYMBOL(remove_suid);
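/*
 * Illustrative sketch (hypothetical caller): write paths call this before
 * copying data into the file so that a write by an unprivileged user
 * clears the setuid/setgid bits.
 */
static void example_prepare_for_write(struct file *file)
{
        remove_suid(file->f_dentry);    /* drop S_ISUID/S_ISGID if required */
}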
1502 
1503 /*
1504  * Copy as much as we can into the page and return the number of bytes which
1505  * were successfully copied.  If a fault is encountered then clear the page
1506  * out to (offset+bytes) and return the number of bytes which were copied.
1507  */
1508 static inline size_t
1509 filemap_copy_from_user(struct page *page, unsigned long offset,
1510                         const char __user *buf, unsigned bytes)
1511 {
1512         char *kaddr;
1513         int left;
1514 
1515         kaddr = kmap_atomic(page, KM_USER0);
1516         left = __copy_from_user(kaddr + offset, buf, bytes);
1517         kunmap_atomic(kaddr, KM_USER0);
1518 
1519         if (left != 0) {
1520                 /* Do it the slow way */
1521                 kaddr = kmap(page);
1522                 left = __copy_from_user(kaddr + offset, buf, bytes);
1523                 kunmap(page);
1524         }
1525         return bytes - left;
1526 }
1527 
1528 static size_t
1529 __filemap_copy_from_user_iovec(char *vaddr, 
1530                         const struct iovec *iov, size_t base, size_t bytes)
1531 {
1532         size_t copied = 0, left = 0;
1533 
1534         while (bytes) {
1535                 char __user *buf = iov->iov_base + base;
1536                 int copy = min(bytes, iov->iov_len - base);
1537 
1538                 base = 0;
1539                 left = __copy_from_user(vaddr, buf, copy);
1540                 copied += copy;
1541                 bytes -= copy;
1542                 vaddr += copy;
1543                 iov++;
1544 
1545                 if (unlikely(left)) {
1546                         /* zero the rest of the target like __copy_from_user */
1547                         if (bytes)
1548                                 memset(vaddr, 0, bytes);
1549                         break;
1550                 }
1551         }
1552         return copied - left;
1553 }
1554 
1555 /*
1556  * This has the same side effects and return value as filemap_copy_from_user().
1557  * The difference is that on a fault we need to memset the remainder of the
1558  * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1559  * single-segment behaviour.
1560  */
1561 static inline size_t
1562 filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1563                         const struct iovec *iov, size_t base, size_t bytes)
1564 {
1565         char *kaddr;
1566         size_t copied;
1567 
1568         kaddr = kmap_atomic(page, KM_USER0);
1569         copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1570                                                 base, bytes);
1571         kunmap_atomic(kaddr, KM_USER0);
1572         if (copied != bytes) {
1573                 kaddr = kmap(page);
1574                 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1575                                                         base, bytes);
1576                 kunmap(page);
1577         }
1578         return copied;
1579 }
1580 
1581 static inline void
1582 filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1583 {
1584         const struct iovec *iov = *iovp;
1585         size_t base = *basep;
1586 
1587         while (bytes) {
1588                 int copy = min(bytes, iov->iov_len - base);
1589 
1590                 bytes -= copy;
1591                 base += copy;
1592                 if (iov->iov_len == base) {
1593                         iov++;
1594                         base = 0;
1595                 }
1596         }
1597         *iovp = iov;
1598         *basep = base;
1599 }
1600 
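/*
 * Illustrative sketch (not part of the original file): how the iovec
 * helpers above cooperate in a multi-segment write.  After each chunk
 * the (iov, base) cursor is advanced by the number of bytes accepted,
 * so the next page-sized chunk starts in the right segment (the real
 * loop below advances by the commit_write() result).  `example_advance'
 * is a hypothetical name.
 */
#if 0
static size_t example_advance(struct page *page, unsigned long offset,
				const struct iovec **cur_iov, size_t *iov_base,
				size_t bytes)
{
	size_t copied;

	copied = filemap_copy_from_user_iovec(page, offset,
						*cur_iov, *iov_base, bytes);
	if (copied)
		filemap_set_next_iovec(cur_iov, iov_base, copied);
	return copied;
}
#endif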
1601 /*
1602  * Performs necessary checks before doing a write
1603  *
1604  * Can adjust the writing position or the number of bytes to write.
1605  * Returns an appropriate error code that the caller should return,
1606  * or zero if the write should be allowed.
1607  */
1608 inline int generic_write_checks(struct inode *inode,
1609                 struct file *file, loff_t *pos, size_t *count, int isblk)
1610 {
1611         unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1612 
1613         if (unlikely(*pos < 0))
1614                 return -EINVAL;
1615 
1616         if (unlikely(file->f_error)) {
1617                 int err = file->f_error;
1618                 file->f_error = 0;
1619                 return err;
1620         }
1621 
1622         if (!isblk) {
1623                 /* FIXME: this is for backwards compatibility with 2.4 */
1624                 if (file->f_flags & O_APPEND)
1625                         *pos = i_size_read(inode);
1626 
1627                 if (limit != RLIM_INFINITY) {
1628                         if (*pos >= limit) {
1629                                 send_sig(SIGXFSZ, current, 0);
1630                                 return -EFBIG;
1631                         }
1632                         if (*count > limit - (typeof(limit))*pos) {
1633                                 *count = limit - (typeof(limit))*pos;
1634                         }
1635                 }
1636         }
1637 
1638         /*
1639          * LFS rule
1640          */
1641         if (unlikely(*pos + *count > MAX_NON_LFS &&
1642                                 !(file->f_flags & O_LARGEFILE))) {
1643                 if (*pos >= MAX_NON_LFS) {
1644                         send_sig(SIGXFSZ, current, 0);
1645                         return -EFBIG;
1646                 }
1647                 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1648                         *count = MAX_NON_LFS - (unsigned long)*pos;
1649                 }
1650         }
1651 
1652         /*
1653          * Are we about to exceed the fs block limit ?
1654          *
1655          * If we have written data it becomes a short write.  If we have
1656          * exceeded without writing data we send a signal and return EFBIG.
1657  * Linus' frestrict idea will clean these up nicely.
1658          */
1659         if (likely(!isblk)) {
1660                 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1661                         if (*count || *pos > inode->i_sb->s_maxbytes) {
1662                                 send_sig(SIGXFSZ, current, 0);
1663                                 return -EFBIG;
1664                         }
1665                         /* zero-length writes at ->s_maxbytes are OK */
1666                 }
1667 
1668                 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1669                         *count = inode->i_sb->s_maxbytes - *pos;
1670         } else {
1671                 loff_t isize;
1672                 if (bdev_read_only(inode->i_bdev))
1673                         return -EPERM;
1674                 isize = i_size_read(inode);
1675                 if (*pos >= isize) {
1676                         if (*count || *pos > isize)
1677                                 return -ENOSPC;
1678                 }
1679 
1680                 if (*pos + *count > isize)
1681                         *count = isize - *pos;
1682         }
1683         return 0;
1684 }
1685 
1686 EXPORT_SYMBOL(generic_write_checks);
1687 
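/*
 * Illustrative sketch (not part of the original file): the calling
 * convention for generic_write_checks().  `pos' and `count' are passed
 * by reference and may come back clamped (O_APPEND, RLIMIT_FSIZE,
 * non-LFS and s_maxbytes limits); a zero count afterwards means there
 * is nothing left to write.  `example_checked_write' is a hypothetical
 * name.
 */
#if 0
static ssize_t example_checked_write(struct file *file, loff_t *ppos,
					size_t count)
{
	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
	loff_t pos = *ppos;
	int err;

	err = generic_write_checks(inode, file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (err)
		return err;
	if (count == 0)
		return 0;
	/* ... write the (possibly shortened) `count' bytes at `pos' ... */
	return count;
}
#endif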
1688 /*
1689  * Write to a file through the page cache. 
1690  *
1691  * We put everything into the page cache prior to writing it. This is not a
1692  * problem when writing full pages. With partial pages, however, we first have
1693  * to read the data into the cache, then dirty the page, and finally schedule
1694  * it for writing by marking it dirty.
1695  *                                                      okir@monad.swb.de
1696  */
1697 ssize_t
1698 generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
1699                                 unsigned long nr_segs, loff_t *ppos)
1700 {
1701         struct file *file = iocb->ki_filp;
1702         struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
1703         struct address_space_operations *a_ops = mapping->a_ops;
1704         size_t ocount;          /* original count */
1705         size_t count;           /* after file limit checks */
1706         struct inode    *inode = mapping->host;
1707         long            status = 0;
1708         loff_t          pos;
1709         struct page     *page;
1710         struct page     *cached_page = NULL;
1711         const int       isblk = S_ISBLK(inode->i_mode);
1712         ssize_t         written;
1713         ssize_t         err;
1714         size_t          bytes;
1715         struct pagevec  lru_pvec;
1716         const struct iovec *cur_iov = iov; /* current iovec */
1717         size_t          iov_base = 0;      /* offset in the current iovec */
1718         unsigned long   seg;
1719         char __user     *buf;
1720 
1721         ocount = 0;
1722         for (seg = 0; seg < nr_segs; seg++) {
1723                 const struct iovec *iv = &iov[seg];
1724 
1725                 /*
1726                  * If any segment has a negative length, or the cumulative
1727                  * length ever wraps negative then return -EINVAL.
1728                  */
1729                 ocount += iv->iov_len;
1730                 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1731                         return -EINVAL;
1732                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1733                         continue;
1734                 if (seg == 0)
1735                         return -EFAULT;
1736                 nr_segs = seg;
1737                 ocount -= iv->iov_len;  /* This segment is no good */
1738                 break;
1739         }
1740 
1741         count = ocount;
1742         pos = *ppos;
1743         pagevec_init(&lru_pvec, 0);
1744 
1745         /* We can write back this queue in page reclaim */
1746         current->backing_dev_info = mapping->backing_dev_info;
1747         written = 0;
1748 
1749         err = generic_write_checks(inode, file, &pos, &count, isblk);
1750         if (err)
1751                 goto out;
1752 
1753 
1754         if (count == 0)
1755                 goto out;
1756 
1757         remove_suid(file->f_dentry);
1758         inode_update_time(inode, 1);
1759 
1760         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1761         if (unlikely(file->f_flags & O_DIRECT)) {
1762                 if (count != ocount)
1763                         nr_segs = iov_shorten((struct iovec *)iov,
1764                                                 nr_segs, count);
1765                 written = generic_file_direct_IO(WRITE, iocb,
1766                                         iov, pos, nr_segs);
1767                 if (written > 0) {
1768                         loff_t end = pos + written;
1769                         if (end > i_size_read(inode) && !isblk) {
1770                                 i_size_write(inode,  end);
1771                                 mark_inode_dirty(inode);
1772                         }
1773                         *ppos = end;
1774                 }
1775                 /*
1776                  * Sync the fs metadata but not the minor inode changes, and
1777                  * of course not the data, as we did direct DMA for the IO.
1778                  */
1779                 if (written >= 0 && file->f_flags & O_SYNC)
1780                         status = generic_osync_inode(inode, OSYNC_METADATA);
1781                 if (written >= 0 && !is_sync_kiocb(iocb))
1782                         written = -EIOCBQUEUED;
1783                 goto out_status;
1784         }
1785 
1786         buf = iov->iov_base;
1787         do {
1788                 unsigned long index;
1789                 unsigned long offset;
1790                 size_t copied;
1791 
1792                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1793                 index = pos >> PAGE_CACHE_SHIFT;
1794                 bytes = PAGE_CACHE_SIZE - offset;
1795                 if (bytes > count)
1796                         bytes = count;
1797 
1798                 /*
1799                  * Bring in the user page that we will copy from _first_.
1800                  * Otherwise there's a nasty deadlock on copying from the
1801                  * same page as we're writing to, without it being marked
1802                  * up-to-date.
1803                  */
1804                 fault_in_pages_readable(buf, bytes);
1805 
1806                 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
1807                 if (!page) {
1808                         status = -ENOMEM;
1809                         break;
1810                 }
1811 
1812                 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1813                 if (unlikely(status)) {
1814                         loff_t isize = i_size_read(inode);
1815                         /*
1816                          * prepare_write() may have instantiated a few blocks
1817                          * outside i_size.  Trim these off again.
1818                          */
1819                         unlock_page(page);
1820                         page_cache_release(page);
1821                         if (pos + bytes > isize)
1822                                 vmtruncate(inode, isize);
1823                         break;
1824                 }
1825                 if (likely(nr_segs == 1))
1826                         copied = filemap_copy_from_user(page, offset,
1827                                                         buf, bytes);
1828                 else
1829                         copied = filemap_copy_from_user_iovec(page, offset,
1830                                                 cur_iov, iov_base, bytes);
1831                 flush_dcache_page(page);
1832                 status = a_ops->commit_write(file, page, offset, offset+bytes);
1833                 if (likely(copied > 0)) {
1834                         if (!status)
1835                                 status = copied;
1836 
1837                         if (status >= 0) {
1838                                 written += status;
1839                                 count -= status;
1840                                 pos += status;
1841                                 buf += status;
1842                                 if (unlikely(nr_segs > 1))
1843                                         filemap_set_next_iovec(&cur_iov,
1844                                                         &iov_base, status);
1845                         }
1846                 }
1847                 if (unlikely(copied != bytes))
1848                         if (status >= 0)
1849                                 status = -EFAULT;
1850                 unlock_page(page);
1851                 mark_page_accessed(page);
1852                 page_cache_release(page);
1853                 if (status < 0)
1854                         break;
1855                 balance_dirty_pages_ratelimited(mapping);
1856                 cond_resched();
1857         } while (count);
1858         *ppos = pos;
1859 
1860         if (cached_page)
1861                 page_cache_release(cached_page);
1862 
1863         /*
1864          * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
1865          */
1866         if (status >= 0) {
1867                 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1868                         status = generic_osync_inode(inode,
1869                                         OSYNC_METADATA|OSYNC_DATA);
1870         }
1871         
1872 out_status:     
1873         err = written ? written : status;
1874 out:
1875         pagevec_lru_add(&lru_pvec);
1876         current->backing_dev_info = NULL;
1877         return err;
1878 }
1879 
1880 EXPORT_SYMBOL(generic_file_aio_write_nolock);
1881 
1882 ssize_t
1883 generic_file_write_nolock(struct file *file, const struct iovec *iov,
1884                                 unsigned long nr_segs, loff_t *ppos)
1885 {
1886         struct kiocb kiocb;
1887         ssize_t ret;
1888 
1889         init_sync_kiocb(&kiocb, file);
1890         ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
1891         if (-EIOCBQUEUED == ret)
1892                 ret = wait_on_sync_kiocb(&kiocb);
1893         return ret;
1894 }
1895 
1896 EXPORT_SYMBOL(generic_file_write_nolock);
1897 
1898 ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
1899                                size_t count, loff_t pos)
1900 {
1901         struct file *file = iocb->ki_filp;
1902         struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
1903         ssize_t err;
1904         struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
1905 
1906         BUG_ON(iocb->ki_pos != pos);
1907 
1908         down(&inode->i_sem);
1909         err = generic_file_aio_write_nolock(iocb, &local_iov, 1, 
1910                                                 &iocb->ki_pos);
1911         up(&inode->i_sem);
1912 
1913         return err;
1914 }
1915 
1916 EXPORT_SYMBOL(generic_file_aio_write);
1917 
1918 ssize_t generic_file_write(struct file *file, const char __user *buf,
1919                            size_t count, loff_t *ppos)
1920 {
1921         struct inode    *inode = file->f_dentry->d_inode->i_mapping->host;
1922         ssize_t         err;
1923         struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
1924 
1925         down(&inode->i_sem);
1926         err = generic_file_write_nolock(file, &local_iov, 1, ppos);
1927         up(&inode->i_sem);
1928 
1929         return err;
1930 }
1931 
1932 EXPORT_SYMBOL(generic_file_write);
1933 
1934 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
1935                         unsigned long nr_segs, loff_t *ppos)
1936 {
1937         struct kiocb kiocb;
1938         ssize_t ret;
1939 
1940         init_sync_kiocb(&kiocb, filp);
1941         ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
1942         if (-EIOCBQUEUED == ret)
1943                 ret = wait_on_sync_kiocb(&kiocb);
1944         return ret;
1945 }
1946 
1947 EXPORT_SYMBOL(generic_file_readv);
1948 
1949 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
1950                         unsigned long nr_segs, loff_t * ppos) 
1951 {
1952         struct inode *inode = file->f_dentry->d_inode;
1953         ssize_t ret;
1954 
1955         down(&inode->i_sem);
1956         ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
1957         up(&inode->i_sem);
1958         return ret;
1959 }
1960 
1961 EXPORT_SYMBOL(generic_file_writev);
1962 
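/*
 * Illustrative sketch (not part of the original file): how a "normal"
 * filesystem typically plugs the generic entry points exported from
 * this file into its file_operations.  The structure name is
 * hypothetical; the read-side helpers are defined earlier in filemap.c
 * and generic_file_llseek() lives in fs/read_write.c.
 */
#if 0
static struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_file_read,
	.write		= generic_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.readv		= generic_file_readv,
	.writev		= generic_file_writev,
	.mmap		= generic_file_mmap,
};
#endif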
1963 ssize_t
1964 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
1965         loff_t offset, unsigned long nr_segs)
1966 {
1967         struct file *file = iocb->ki_filp;
1968         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1969         ssize_t retval;
1970 
1971         if (mapping->nrpages) {
1972                 retval = filemap_fdatawrite(mapping);
1973                 if (retval == 0)
1974                         retval = filemap_fdatawait(mapping);
1975                 if (retval)
1976                         goto out;
1977         }
1978 
1979         retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
1980         if (rw == WRITE && mapping->nrpages)
1981                 invalidate_inode_pages2(mapping);
1982 out:
1983         return retval;
1984 }
1985 
1986 EXPORT_SYMBOL_GPL(generic_file_direct_IO);
1987 
