TOMOYO Linux Cross Reference
Linux/net/core/page_pool.c


  1 /* SPDX-License-Identifier: GPL-2.0
  2  *
  3  * page_pool.c
  4  *      Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
  5  *      Copyright (C) 2016 Red Hat, Inc.
  6  */
  7 
  8 #include <linux/types.h>
  9 #include <linux/kernel.h>
 10 #include <linux/slab.h>
 11 #include <linux/device.h>
 12 
 13 #include <net/page_pool.h>
 14 #include <linux/dma-direction.h>
 15 #include <linux/dma-mapping.h>
 16 #include <linux/page-flags.h>
 17 #include <linux/mm.h> /* for __put_page() */
 18 
 19 #include <trace/events/page_pool.h>
 20 
 21 #define DEFER_TIME (msecs_to_jiffies(1000))
 22 #define DEFER_WARN_INTERVAL (60 * HZ)
 23 
 24 static int page_pool_init(struct page_pool *pool,
 25                           const struct page_pool_params *params)
 26 {
 27         unsigned int ring_qsize = 1024; /* Default */
 28 
 29         memcpy(&pool->p, params, sizeof(pool->p));
 30 
 31         /* Validate only known flags were used */
 32         if (pool->p.flags & ~(PP_FLAG_ALL))
 33                 return -EINVAL;
 34 
 35         if (pool->p.pool_size)
 36                 ring_qsize = pool->p.pool_size;
 37 
 38         /* Sanity limit mem that can be pinned down */
 39         if (ring_qsize > 32768)
 40                 return -E2BIG;
 41 
 42         /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 43          * DMA_BIDIRECTIONAL also allows the page to be used for DMA
 44          * sending (TX), which is the XDP_TX use-case.
 45          */
 46         if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 47             (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 48                 return -EINVAL;
 49 
 50         if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 51                 return -ENOMEM;
 52 
 53         atomic_set(&pool->pages_state_release_cnt, 0);
 54 
 55         /* Driver calling page_pool_create() must also call page_pool_destroy() */
 56         refcount_set(&pool->user_cnt, 1);
 57 
 58         if (pool->p.flags & PP_FLAG_DMA_MAP)
 59                 get_device(pool->p.dev);
 60 
 61         return 0;
 62 }
 63 
 64 struct page_pool *page_pool_create(const struct page_pool_params *params)
 65 {
 66         struct page_pool *pool;
 67         int err = 0;
 68 
 69         pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 70         if (!pool)
 71                 return ERR_PTR(-ENOMEM);
 72 
 73         err = page_pool_init(pool, params);
 74         if (err < 0) {
 75                 pr_warn("%s() gave up with errno %d\n", __func__, err);
 76                 kfree(pool);
 77                 return ERR_PTR(err);
 78         }
 79 
 80         return pool;
 81 }
 82 EXPORT_SYMBOL(page_pool_create);
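
/* Example: a minimal, hypothetical sketch of how a driver might create a
 * pool with page_pool_create().  The helper name, queue size and device
 * pointer are illustrative assumptions, not taken from this file; only the
 * page_pool_params fields consumed by page_pool_init() above are set.
 */
#if 0   /* illustrative sketch, not compiled */
static struct page_pool *example_rx_pool_create(struct device *dev)
{
        struct page_pool_params pp_params = {
                .flags     = PP_FLAG_DMA_MAP,   /* pool maps pages for DMA */
                .order     = 0,                 /* order-0 (single) pages */
                .pool_size = 256,               /* ring size, must be <= 32768 */
                .nid       = NUMA_NO_NODE,
                .dev       = dev,               /* required with PP_FLAG_DMA_MAP */
                .dma_dir   = DMA_FROM_DEVICE,   /* or DMA_BIDIRECTIONAL for XDP_TX */
        };
        struct page_pool *pool;

        pool = page_pool_create(&pp_params);
        if (IS_ERR(pool))
                return NULL;

        return pool;
}
#endif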
 83 
 84 /* fast path */
 85 static struct page *__page_pool_get_cached(struct page_pool *pool)
 86 {
 87         struct ptr_ring *r = &pool->ring;
 88         struct page *page;
 89 
 90         /* Quicker fallback, avoid locks when ring is empty */
 91         if (__ptr_ring_empty(r))
 92                 return NULL;
 93 
 94         /* Test for safe-context, caller should provide this guarantee */
 95         if (likely(in_serving_softirq())) {
 96                 if (likely(pool->alloc.count)) {
 97                         /* Fast-path */
 98                         page = pool->alloc.cache[--pool->alloc.count];
 99                         return page;
100                 }
101                 /* Slower-path: Alloc array empty, time to refill
102                  *
103                  * Open-coded bulk ptr_ring consumer.
104                  *
105                  * Discussion: the ring consumer lock is not really
106                  * needed due to the softirq/NAPI protection, but
107                  * we will later need the ability to reclaim pages
108                  * from the ring. Thus, keep the lock.
109                  */
110                 spin_lock(&r->consumer_lock);
111                 while ((page = __ptr_ring_consume(r))) {
112                         if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
113                                 break;
114                         pool->alloc.cache[pool->alloc.count++] = page;
115                 }
116                 spin_unlock(&r->consumer_lock);
117                 return page;
118         }
119 
120         /* Slow-path: Get page from locked ring queue */
121         page = ptr_ring_consume(&pool->ring);
122         return page;
123 }
124 
125 /* slow path */
126 noinline
127 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
128                                                  gfp_t _gfp)
129 {
130         struct page *page;
131         gfp_t gfp = _gfp;
132         dma_addr_t dma;
133 
134         /* We could always set __GFP_COMP, and avoid this branch, as
135          * prep_new_page() can handle order-0 with __GFP_COMP.
136          */
137         if (pool->p.order)
138                 gfp |= __GFP_COMP;
139 
140         /* FUTURE development:
141          *
142          * Current slow-path essentially falls back to single page
143          * allocations, which doesn't improve performance.  This code
144          * needs bulk allocation support from the page allocator code.
145          */
146 
147         /* Cache was empty, do real allocation */
148         page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
149         if (!page)
150                 return NULL;
151 
152         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
153                 goto skip_dma_map;
154 
155         /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
156          * since dma_addr_t can be either 32 or 64 bits and does not always fit
157          * into page private data (i.e 32bit cpu with 64bit DMA caps)
158          * This mapping is kept for lifetime of page, until leaving pool.
159          */
160         dma = dma_map_page_attrs(pool->p.dev, page, 0,
161                                  (PAGE_SIZE << pool->p.order),
162                                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
163         if (dma_mapping_error(pool->p.dev, dma)) {
164                 put_page(page);
165                 return NULL;
166         }
167         page->dma_addr = dma;
168 
169 skip_dma_map:
170         /* Track how many pages are held 'in-flight' */
171         pool->pages_state_hold_cnt++;
172 
173         trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
174 
175         /* A page just alloc'ed should/must have a refcnt of 1. */
176         return page;
177 }
178 
179 /* Use page_pool to replace alloc_pages() API calls; note that the caller
180  * must provide the synchronization guarantee for the allocation side.
181  */
182 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
183 {
184         struct page *page;
185 
186         /* Fast-path: Get a page from cache */
187         page = __page_pool_get_cached(pool);
188         if (page)
189                 return page;
190 
191         /* Slow-path: cache empty, do real allocation */
192         page = __page_pool_alloc_pages_slow(pool, gfp);
193         return page;
194 }
195 EXPORT_SYMBOL(page_pool_alloc_pages);
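
/* Example: a hypothetical allocation sketch.  It assumes the caller runs in
 * NAPI/softirq context (the synchronization guarantee mentioned above), so
 * GFP_ATOMIC is used; the RX-descriptor hand-off is left as a placeholder.
 */
#if 0   /* illustrative sketch, not compiled */
static int example_rx_refill_one(struct page_pool *pool)
{
        struct page *page;
        dma_addr_t dma;

        page = page_pool_alloc_pages(pool, GFP_ATOMIC);
        if (!page)
                return -ENOMEM;

        /* With PP_FLAG_DMA_MAP the mapping was set up in the slow path above */
        dma = page->dma_addr;

        /* program 'dma' into a (hypothetical) RX descriptor here */
        (void)dma;

        return 0;
}
#endif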
196 
197 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
198  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
199  */
200 #define _distance(a, b) (s32)((a) - (b))
201 
202 static s32 page_pool_inflight(struct page_pool *pool)
203 {
204         u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
205         u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
206         s32 inflight;
207 
208         inflight = _distance(hold_cnt, release_cnt);
209 
210         trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
211         WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
212 
213         return inflight;
214 }
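
/* Example: a worked case of the serial-number arithmetic used above.  The
 * values are hypothetical; they show that _distance() stays correct when
 * pages_state_hold_cnt has already wrapped past U32_MAX while the release
 * counter has not.
 */
#if 0   /* illustrative sketch, not compiled */
static void example_inflight_wraparound(void)
{
        u32 hold_cnt    = 3;            /* wrapped: 0x00000003 */
        u32 release_cnt = 0xfffffffe;   /* not yet wrapped: U32_MAX - 1 */

        /* (s32)(0x00000003 - 0xfffffffe) == 5, i.e. 5 pages in flight */
        s32 inflight = _distance(hold_cnt, release_cnt);

        (void)inflight;
}
#endif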
215 
216 /* Cleanup page_pool state from page */
217 static void __page_pool_clean_page(struct page_pool *pool,
218                                    struct page *page)
219 {
220         dma_addr_t dma;
221         int count;
222 
223         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
224                 goto skip_dma_unmap;
225 
226         dma = page->dma_addr;
227         /* DMA unmap */
228         dma_unmap_page_attrs(pool->p.dev, dma,
229                              PAGE_SIZE << pool->p.order, pool->p.dma_dir,
230                              DMA_ATTR_SKIP_CPU_SYNC);
231         page->dma_addr = 0;
232 skip_dma_unmap:
233         /* This may be the last page returned, releasing the pool, so
234          * it is not safe to reference pool afterwards.
235          */
236         count = atomic_inc_return(&pool->pages_state_release_cnt);
237         trace_page_pool_state_release(pool, page, count);
238 }
239 
240 /* unmap the page and clean our state */
241 void page_pool_unmap_page(struct page_pool *pool, struct page *page)
242 {
243         /* When the page is unmapped, this implies the page will not be
244          * returned to the page_pool.
245          */
246         __page_pool_clean_page(pool, page);
247 }
248 EXPORT_SYMBOL(page_pool_unmap_page);
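
/* Example: a hypothetical sketch of the non-recycle path.  A driver that
 * hands a page to another owner for good (e.g. when building an skb that
 * the stack will free with put_page()) is assumed to unmap it first; the
 * helper name and the skb-building step are placeholders.
 */
#if 0   /* illustrative sketch, not compiled */
static void example_give_page_away(struct page_pool *pool, struct page *page)
{
        /* Release the pool's DMA mapping and in-flight accounting now,
         * because this page will not come back through the pool.
         */
        page_pool_unmap_page(pool, page);

        /* ... attach the page to an skb / hand it to another consumer ...
         * The eventual put_page() then frees it via the page allocator.
         */
}
#endif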
249 
250 /* Return a page to the page allocator, cleaning up our state */
251 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
252 {
253         __page_pool_clean_page(pool, page);
254 
255         put_page(page);
256         /* An optimization would be to call __free_pages(page, pool->p.order)
257          * knowing page is not part of page-cache (thus avoiding a
258          * __page_cache_release() call).
259          */
260 }
261 
262 static bool __page_pool_recycle_into_ring(struct page_pool *pool,
263                                    struct page *page)
264 {
265         int ret;
266         /* BH protection not needed if current is serving softirq */
267         if (in_serving_softirq())
268                 ret = ptr_ring_produce(&pool->ring, page);
269         else
270                 ret = ptr_ring_produce_bh(&pool->ring, page);
271 
272         return ret == 0;
273 }
274 
275 /* Only allow direct recycling in special circumstances, into the
276  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
277  *
278  * Caller must provide appropriate safe context.
279  */
280 static bool __page_pool_recycle_direct(struct page *page,
281                                        struct page_pool *pool)
282 {
283         if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
284                 return false;
285 
286         /* Caller MUST have verified/know (page_ref_count(page) == 1) */
287         pool->alloc.cache[pool->alloc.count++] = page;
288         return true;
289 }
290 
291 void __page_pool_put_page(struct page_pool *pool,
292                           struct page *page, bool allow_direct)
293 {
294         /* This allocator is optimized for the XDP mode that uses
295          * one-frame-per-page, but has fallbacks that act like the
296          * regular page allocator APIs.
297          *
298          * refcnt == 1 means page_pool owns page, and can recycle it.
299          */
300         if (likely(page_ref_count(page) == 1)) {
301                 /* Read barrier done in page_ref_count / READ_ONCE */
302 
303                 if (allow_direct && in_serving_softirq())
304                         if (__page_pool_recycle_direct(page, pool))
305                                 return;
306 
307                 if (!__page_pool_recycle_into_ring(pool, page)) {
308                         /* Cache full, fallback to free pages */
309                         __page_pool_return_page(pool, page);
310                 }
311                 return;
312         }
313         /* Fallback/non-XDP mode: the API user has an elevated refcnt.
314          *
315          * Many drivers split up the page into fragments, and some
316          * want to keep doing this to save memory and do refcnt based
317          * recycling. Support this use case too, to ease drivers
318          * switching between XDP/non-XDP.
319          *
320          * In case page_pool maintains the DMA mapping, the API user must
321          * call page_pool_put_page() once.  In this elevated refcnt
322          * case, the DMA mapping is unmapped/released here, as the driver
323          * is likely doing refcnt-based recycle tricks, meaning another
324          * process will be invoking put_page().
325          */
326         __page_pool_clean_page(pool, page);
327         put_page(page);
328 }
329 EXPORT_SYMBOL(__page_pool_put_page);
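
/* Example: hypothetical recycle sketches for both contexts described above.
 * The first assumes the RX-NAPI softirq (e.g. an XDP_DROP verdict), where
 * direct recycling into the alloc-side cache is allowed; the second assumes
 * any other context, where only the ptr_ring path is safe.
 */
#if 0   /* illustrative sketch, not compiled */
static void example_recycle_from_napi(struct page_pool *pool, struct page *page)
{
        /* refcnt is still 1 here, so the fast direct path can be used */
        __page_pool_put_page(pool, page, true /* allow_direct */);
}

static void example_recycle_from_elsewhere(struct page_pool *pool,
                                           struct page *page)
{
        /* Not in softirq context: recycle via the BH-protected ring */
        __page_pool_put_page(pool, page, false /* allow_direct */);
}
#endif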
330 
331 static void __page_pool_empty_ring(struct page_pool *pool)
332 {
333         struct page *page;
334 
335         /* Empty recycle ring */
336         while ((page = ptr_ring_consume_bh(&pool->ring))) {
337                 /* Verify the refcnt invariant of cached pages */
338                 if (page_ref_count(page) != 1)
339                         pr_crit("%s() page_pool refcnt %d violation\n",
340                                 __func__, page_ref_count(page));
341 
342                 __page_pool_return_page(pool, page);
343         }
344 }
345 
346 static void page_pool_free(struct page_pool *pool)
347 {
348         if (pool->disconnect)
349                 pool->disconnect(pool);
350 
351         ptr_ring_cleanup(&pool->ring, NULL);
352 
353         if (pool->p.flags & PP_FLAG_DMA_MAP)
354                 put_device(pool->p.dev);
355 
356         kfree(pool);
357 }
358 
359 static void page_pool_scrub(struct page_pool *pool)
360 {
361         struct page *page;
362 
363         /* Empty the alloc cache; assume the caller made sure it is
364          * no longer in use, and that page_pool_alloc_pages() cannot be
365          * called concurrently.
366          */
367         while (pool->alloc.count) {
368                 page = pool->alloc.cache[--pool->alloc.count];
369                 __page_pool_return_page(pool, page);
370         }
371 
372         /* No more consumers should exist, but producers could still
373          * be in-flight.
374          */
375         __page_pool_empty_ring(pool);
376 }
377 
378 static int page_pool_release(struct page_pool *pool)
379 {
380         int inflight;
381 
382         page_pool_scrub(pool);
383         inflight = page_pool_inflight(pool);
384         if (!inflight)
385                 page_pool_free(pool);
386 
387         return inflight;
388 }
389 
390 static void page_pool_release_retry(struct work_struct *wq)
391 {
392         struct delayed_work *dwq = to_delayed_work(wq);
393         struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
394         int inflight;
395 
396         inflight = page_pool_release(pool);
397         if (!inflight)
398                 return;
399 
400         /* Periodic warning */
401         if (time_after_eq(jiffies, pool->defer_warn)) {
402                 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
403 
404                 pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
405                         __func__, inflight, sec);
406                 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
407         }
408 
409         /* Still not ready to be disconnected, retry later */
410         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
411 }
412 
413 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
414 {
415         refcount_inc(&pool->user_cnt);
416         pool->disconnect = disconnect;
417 }
418 
419 void page_pool_destroy(struct page_pool *pool)
420 {
421         if (!pool)
422                 return;
423 
424         if (!page_pool_put(pool))
425                 return;
426 
427         if (!page_pool_release(pool))
428                 return;
429 
430         pool->defer_start = jiffies;
431         pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
432 
433         INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
434         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
435 }
436 EXPORT_SYMBOL(page_pool_destroy);
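
/* Example: a hypothetical teardown sketch.  It assumes the driver has
 * already stopped its RX/NAPI path, so page_pool_alloc_pages() can no
 * longer race with the scrub performed by page_pool_release(); pages still
 * held elsewhere keep the pool alive via the deferred work above.
 */
#if 0   /* illustrative sketch, not compiled */
static void example_rx_teardown(struct page_pool *pool)
{
        page_pool_destroy(pool);

        /* If pages are still in flight, page_pool_release_retry() keeps
         * rescheduling itself (and warning every DEFER_WARN_INTERVAL)
         * until the last page is returned and the pool is freed.
         */
}
#endif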
437 
