~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/ceph/mds_client.c

Version: ~ [ linux-5.11-rc3 ] ~ [ linux-5.10.7 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.89 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.167 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.215 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.251 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.251 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 #include <linux/ceph/ceph_debug.h>
  2 
  3 #include <linux/fs.h>
  4 #include <linux/wait.h>
  5 #include <linux/slab.h>
  6 #include <linux/sched.h>
  7 #include <linux/debugfs.h>
  8 #include <linux/seq_file.h>
  9 
 10 #include "super.h"
 11 #include "mds_client.h"
 12 
 13 #include <linux/ceph/ceph_features.h>
 14 #include <linux/ceph/messenger.h>
 15 #include <linux/ceph/decode.h>
 16 #include <linux/ceph/pagelist.h>
 17 #include <linux/ceph/auth.h>
 18 #include <linux/ceph/debugfs.h>
 19 
 20 /*
 21  * A cluster of MDS (metadata server) daemons is responsible for
 22  * managing the file system namespace (the directory hierarchy and
 23  * inodes) and for coordinating shared access to storage.  Metadata is
 24  * partitioned hierarchically across a number of servers, and that
 25  * partition varies over time as the cluster adjusts the distribution
 26  * in order to balance load.
 27  *
 28  * The MDS client is primarily responsible for managing synchronous
 29  * metadata requests for operations like open, unlink, and so forth.
 30  * If there is a MDS failure, we find out about it when we (possibly
 31  * request and) receive a new MDS map, and can resubmit affected
 32  * requests.
 33  *
 34  * For the most part, though, we take advantage of a lossless
 35  * communications channel to the MDS, and do not need to worry about
 36  * timing out or resubmitting requests.
 37  *
 38  * We maintain a stateful "session" with each MDS we interact with.
 39  * Within each session, we send periodic heartbeat messages to ensure
 40  * any capabilities or leases we have been issued remain valid.  If
 41  * the session times out and goes stale, our leases and capabilities
 42  * are no longer valid.
 43  */
 44 
 45 struct ceph_reconnect_state {        /* NOTE(review): users of this struct are outside the visible chunk */
 46         int nr_caps;                    /* presumably a running count of caps -- TODO confirm against users */
 47         struct ceph_pagelist *pagelist; /* pagelist handle; presumably the encode target -- TODO confirm */
 48         bool flock;                     /* presumably whether file-lock state is included -- TODO confirm */
 49 };
 50 
 51 static void __wake_requests(struct ceph_mds_client *mdsc,
 52                             struct list_head *head);
 53 
 54 static const struct ceph_connection_operations mds_con_ops;
 55 
 56 
 57 /*
 58  * mds reply parsing
 59  */
 60 
 61 /*
 62  * parse individual inode info
 63  */
 64 static int parse_reply_info_in(void **p, void *end,
 65                                struct ceph_mds_reply_info_in *info,
 66                                u64 features)
 67 {
 68         int err = -EIO;
 69 
 70         info->in = *p;
 71         *p += sizeof(struct ceph_mds_reply_inode) +
 72                 sizeof(*info->in->fragtree.splits) *
 73                 le32_to_cpu(info->in->fragtree.nsplits);
 74 
 75         ceph_decode_32_safe(p, end, info->symlink_len, bad);
 76         ceph_decode_need(p, end, info->symlink_len, bad);
 77         info->symlink = *p;
 78         *p += info->symlink_len;
 79 
 80         if (features & CEPH_FEATURE_DIRLAYOUTHASH)
 81                 ceph_decode_copy_safe(p, end, &info->dir_layout,
 82                                       sizeof(info->dir_layout), bad);
 83         else
 84                 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
 85 
 86         ceph_decode_32_safe(p, end, info->xattr_len, bad);
 87         ceph_decode_need(p, end, info->xattr_len, bad);
 88         info->xattr_data = *p;
 89         *p += info->xattr_len;
 90         return 0;
 91 bad:
 92         return err;
 93 }
 94 
 95 /*
 96  * parse a normal reply, which may contain a (dir+)dentry and/or a
 97  * target inode.
 98  */
 99 static int parse_reply_info_trace(void **p, void *end,
100                                   struct ceph_mds_reply_info_parsed *info,
101                                   u64 features)
102 {
103         int err;
104 
105         if (info->head->is_dentry) {
106                 err = parse_reply_info_in(p, end, &info->diri, features);
107                 if (err < 0)
108                         goto out_bad;
109 
110                 if (unlikely(*p + sizeof(*info->dirfrag) > end))
111                         goto bad;
112                 info->dirfrag = *p;
113                 *p += sizeof(*info->dirfrag) +
114                         sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
115                 if (unlikely(*p > end))
116                         goto bad;
117 
118                 ceph_decode_32_safe(p, end, info->dname_len, bad);
119                 ceph_decode_need(p, end, info->dname_len, bad);
120                 info->dname = *p;
121                 *p += info->dname_len;
122                 info->dlease = *p;
123                 *p += sizeof(*info->dlease);
124         }
125 
126         if (info->head->is_target) {
127                 err = parse_reply_info_in(p, end, &info->targeti, features);
128                 if (err < 0)
129                         goto out_bad;
130         }
131 
132         if (unlikely(*p != end))
133                 goto bad;
134         return 0;
135 
136 bad:
137         err = -EIO;
138 out_bad:
139         pr_err("problem parsing mds trace %d\n", err);
140         return err;
141 }
142 
143 /*
144  * parse readdir results
145  */
146 static int parse_reply_info_dir(void **p, void *end,
147                                 struct ceph_mds_reply_info_parsed *info,
148                                 u64 features)
149 {
150         u32 num, i = 0;
151         int err;
152 
153         info->dir_dir = *p;
154         if (*p + sizeof(*info->dir_dir) > end)
155                 goto bad;
156         *p += sizeof(*info->dir_dir) +
157                 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
158         if (*p > end)
159                 goto bad;
160 
161         ceph_decode_need(p, end, sizeof(num) + 2, bad);
162         num = ceph_decode_32(p);
163         info->dir_end = ceph_decode_8(p);
164         info->dir_complete = ceph_decode_8(p);
165         if (num == 0)
166                 goto done;
167 
168         /* alloc large array */
169         info->dir_nr = num;
170         info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
171                                sizeof(*info->dir_dname) +
172                                sizeof(*info->dir_dname_len) +
173                                sizeof(*info->dir_dlease),
174                                GFP_NOFS);
175         if (info->dir_in == NULL) {
176                 err = -ENOMEM;
177                 goto out_bad;
178         }
179         info->dir_dname = (void *)(info->dir_in + num);
180         info->dir_dname_len = (void *)(info->dir_dname + num);
181         info->dir_dlease = (void *)(info->dir_dname_len + num);
182 
183         while (num) {
184                 /* dentry */
185                 ceph_decode_need(p, end, sizeof(u32)*2, bad);
186                 info->dir_dname_len[i] = ceph_decode_32(p);
187                 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
188                 info->dir_dname[i] = *p;
189                 *p += info->dir_dname_len[i];
190                 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
191                      info->dir_dname[i]);
192                 info->dir_dlease[i] = *p;
193                 *p += sizeof(struct ceph_mds_reply_lease);
194 
195                 /* inode */
196                 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
197                 if (err < 0)
198                         goto out_bad;
199                 i++;
200                 num--;
201         }
202 
203 done:
204         if (*p != end)
205                 goto bad;
206         return 0;
207 
208 bad:
209         err = -EIO;
210 out_bad:
211         pr_err("problem parsing dir contents %d\n", err);
212         return err;
213 }
214 
215 /*
216  * parse fcntl F_GETLK results
217  */
218 static int parse_reply_info_filelock(void **p, void *end,
219                                      struct ceph_mds_reply_info_parsed *info,
220                                      u64 features)
221 {
222         if (*p + sizeof(*info->filelock_reply) > end)
223                 goto bad;
224 
225         info->filelock_reply = *p;
226         *p += sizeof(*info->filelock_reply);
227 
228         if (unlikely(*p != end))
229                 goto bad;
230         return 0;
231 
232 bad:
233         return -EIO;
234 }
235 
236 /*
237  * parse create results
238  */
239 static int parse_reply_info_create(void **p, void *end,
240                                   struct ceph_mds_reply_info_parsed *info,
241                                   u64 features)
242 {
243         if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
244                 if (*p == end) {
245                         info->has_create_ino = false;
246                 } else {
247                         info->has_create_ino = true;
248                         info->ino = ceph_decode_64(p);
249                 }
250         }
251 
252         if (unlikely(*p != end))
253                 goto bad;
254         return 0;
255 
256 bad:
257         return -EIO;
258 }
259 
260 /*
261  * parse extra results
262  */
263 static int parse_reply_info_extra(void **p, void *end,
264                                   struct ceph_mds_reply_info_parsed *info,
265                                   u64 features)
266 {
267         if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
268                 return parse_reply_info_filelock(p, end, info, features);
269         else if (info->head->op == CEPH_MDS_OP_READDIR ||
270                  info->head->op == CEPH_MDS_OP_LSSNAP)
271                 return parse_reply_info_dir(p, end, info, features);
272         else if (info->head->op == CEPH_MDS_OP_CREATE)
273                 return parse_reply_info_create(p, end, info, features);
274         else
275                 return -EIO;
276 }
277 
278 /*
279  * parse entire mds reply
280  */
281 static int parse_reply_info(struct ceph_msg *msg,
282                             struct ceph_mds_reply_info_parsed *info,
283                             u64 features)
284 {
285         void *p, *end;
286         u32 len;
287         int err;
288 
289         info->head = msg->front.iov_base;
290         p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
291         end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
292 
293         /* trace */
294         ceph_decode_32_safe(&p, end, len, bad);
295         if (len > 0) {
296                 ceph_decode_need(&p, end, len, bad);
297                 err = parse_reply_info_trace(&p, p+len, info, features);
298                 if (err < 0)
299                         goto out_bad;
300         }
301 
302         /* extra */
303         ceph_decode_32_safe(&p, end, len, bad);
304         if (len > 0) {
305                 ceph_decode_need(&p, end, len, bad);
306                 err = parse_reply_info_extra(&p, p+len, info, features);
307                 if (err < 0)
308                         goto out_bad;
309         }
310 
311         /* snap blob */
312         ceph_decode_32_safe(&p, end, len, bad);
313         info->snapblob_len = len;
314         info->snapblob = p;
315         p += len;
316 
317         if (p != end)
318                 goto bad;
319         return 0;
320 
321 bad:
322         err = -EIO;
323 out_bad:
324         pr_err("mds parse_reply err %d\n", err);
325         return err;
326 }
327 
328 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
329 {
330         kfree(info->dir_in);
331 }
332 
333 
334 /*
335  * sessions
336  */
337 static const char *session_state_name(int s)
338 {
339         switch (s) {
340         case CEPH_MDS_SESSION_NEW: return "new";
341         case CEPH_MDS_SESSION_OPENING: return "opening";
342         case CEPH_MDS_SESSION_OPEN: return "open";
343         case CEPH_MDS_SESSION_HUNG: return "hung";
344         case CEPH_MDS_SESSION_CLOSING: return "closing";
345         case CEPH_MDS_SESSION_RESTARTING: return "restarting";
346         case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
347         default: return "???";
348         }
349 }
350 
351 static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
352 {
353         if (atomic_inc_not_zero(&s->s_ref)) {
354                 dout("mdsc get_session %p %d -> %d\n", s,
355                      atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
356                 return s;
357         } else {
358                 dout("mdsc get_session %p 0 -- FAIL", s);
359                 return NULL;
360         }
361 }
362 
363 void ceph_put_mds_session(struct ceph_mds_session *s)
364 {
365         dout("mdsc put_session %p %d -> %d\n", s,
366              atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
367         if (atomic_dec_and_test(&s->s_ref)) {
368                 if (s->s_auth.authorizer)
369                         ceph_auth_destroy_authorizer(
370                                 s->s_mdsc->fsc->client->monc.auth,
371                                 s->s_auth.authorizer);
372                 kfree(s);
373         }
374 }
375 
376 /*
377  * called under mdsc->mutex
378  */
379 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
380                                                    int mds)
381 {
382         struct ceph_mds_session *session;
383 
384         if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
385                 return NULL;
386         session = mdsc->sessions[mds];
387         dout("lookup_mds_session %p %d\n", session,
388              atomic_read(&session->s_ref));
389         get_session(session);
390         return session;
391 }
392 
393 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
394 {
395         if (mds >= mdsc->max_sessions)
396                 return false;
397         return mdsc->sessions[mds];
398 }
399 
400 static int __verify_registered_session(struct ceph_mds_client *mdsc,
401                                        struct ceph_mds_session *s)
402 {
403         if (s->s_mds >= mdsc->max_sessions ||
404             mdsc->sessions[s->s_mds] != s)
405                 return -ENOENT;
406         return 0;
407 }
408 
409 /*
410  * create+register a new session for given mds.
411  * called under mdsc->mutex.
412  */
413 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
414                                                  int mds)
415 {
416         struct ceph_mds_session *s;
417 
418         if (mds >= mdsc->mdsmap->m_max_mds)
419                 return ERR_PTR(-EINVAL);
420 
421         s = kzalloc(sizeof(*s), GFP_NOFS);
422         if (!s)
423                 return ERR_PTR(-ENOMEM);
424         s->s_mdsc = mdsc;
425         s->s_mds = mds;
426         s->s_state = CEPH_MDS_SESSION_NEW;
427         s->s_ttl = 0;
428         s->s_seq = 0;
429         mutex_init(&s->s_mutex);
430 
431         ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
432 
433         spin_lock_init(&s->s_gen_ttl_lock);
434         s->s_cap_gen = 0;
435         s->s_cap_ttl = jiffies - 1;
436 
437         spin_lock_init(&s->s_cap_lock);
438         s->s_renew_requested = 0;
439         s->s_renew_seq = 0;
440         INIT_LIST_HEAD(&s->s_caps);
441         s->s_nr_caps = 0;
442         s->s_trim_caps = 0;
443         atomic_set(&s->s_ref, 1);
444         INIT_LIST_HEAD(&s->s_waiting);
445         INIT_LIST_HEAD(&s->s_unsafe);
446         s->s_num_cap_releases = 0;
447         s->s_cap_reconnect = 0;
448         s->s_cap_iterator = NULL;
449         INIT_LIST_HEAD(&s->s_cap_releases);
450         INIT_LIST_HEAD(&s->s_cap_releases_done);
451         INIT_LIST_HEAD(&s->s_cap_flushing);
452         INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
453 
454         dout("register_session mds%d\n", mds);
455         if (mds >= mdsc->max_sessions) {
456                 int newmax = 1 << get_count_order(mds+1);
457                 struct ceph_mds_session **sa;
458 
459                 dout("register_session realloc to %d\n", newmax);
460                 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
461                 if (sa == NULL)
462                         goto fail_realloc;
463                 if (mdsc->sessions) {
464                         memcpy(sa, mdsc->sessions,
465                                mdsc->max_sessions * sizeof(void *));
466                         kfree(mdsc->sessions);
467                 }
468                 mdsc->sessions = sa;
469                 mdsc->max_sessions = newmax;
470         }
471         mdsc->sessions[mds] = s;
472         atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
473 
474         ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
475                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
476 
477         return s;
478 
479 fail_realloc:
480         kfree(s);
481         return ERR_PTR(-ENOMEM);
482 }
483 
484 /*
485  * called under mdsc->mutex
486  */
487 static void __unregister_session(struct ceph_mds_client *mdsc,
488                                struct ceph_mds_session *s)
489 {
490         dout("__unregister_session mds%d %p\n", s->s_mds, s);
491         BUG_ON(mdsc->sessions[s->s_mds] != s);
492         mdsc->sessions[s->s_mds] = NULL;
493         ceph_con_close(&s->s_con);
494         ceph_put_mds_session(s);
495 }
496 
497 /*
498  * drop session refs in request.
499  *
500  * should be last request ref, or hold mdsc->mutex
501  */
502 static void put_request_session(struct ceph_mds_request *req)
503 {
504         if (req->r_session) {
505                 ceph_put_mds_session(req->r_session);
506                 req->r_session = NULL;
507         }
508 }
509 
510 void ceph_mdsc_release_request(struct kref *kref)
511 {
512         struct ceph_mds_request *req = container_of(kref,
513                                                     struct ceph_mds_request,
514                                                     r_kref);
515         if (req->r_request)
516                 ceph_msg_put(req->r_request);
517         if (req->r_reply) {
518                 ceph_msg_put(req->r_reply);
519                 destroy_reply_info(&req->r_reply_info);
520         }
521         if (req->r_inode) {
522                 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
523                 iput(req->r_inode);
524         }
525         if (req->r_locked_dir)
526                 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
527         if (req->r_target_inode)
528                 iput(req->r_target_inode);
529         if (req->r_dentry)
530                 dput(req->r_dentry);
531         if (req->r_old_dentry) {
532                 /*
533                  * track (and drop pins for) r_old_dentry_dir
534                  * separately, since r_old_dentry's d_parent may have
535                  * changed between the dir mutex being dropped and
536                  * this request being freed.
537                  */
538                 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
539                                   CEPH_CAP_PIN);
540                 dput(req->r_old_dentry);
541                 iput(req->r_old_dentry_dir);
542         }
543         kfree(req->r_path1);
544         kfree(req->r_path2);
545         put_request_session(req);
546         ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
547         kfree(req);
548 }
549 
550 /*
551  * lookup session, bump ref if found.
552  *
553  * called under mdsc->mutex.
554  */
555 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
556                                              u64 tid)
557 {
558         struct ceph_mds_request *req;
559         struct rb_node *n = mdsc->request_tree.rb_node;
560 
561         while (n) {
562                 req = rb_entry(n, struct ceph_mds_request, r_node);
563                 if (tid < req->r_tid)
564                         n = n->rb_left;
565                 else if (tid > req->r_tid)
566                         n = n->rb_right;
567                 else {
568                         ceph_mdsc_get_request(req);
569                         return req;
570                 }
571         }
572         return NULL;
573 }
574 
575 static void __insert_request(struct ceph_mds_client *mdsc,
576                              struct ceph_mds_request *new)
577 {
578         struct rb_node **p = &mdsc->request_tree.rb_node;
579         struct rb_node *parent = NULL;
580         struct ceph_mds_request *req = NULL;
581 
582         while (*p) {
583                 parent = *p;
584                 req = rb_entry(parent, struct ceph_mds_request, r_node);
585                 if (new->r_tid < req->r_tid)
586                         p = &(*p)->rb_left;
587                 else if (new->r_tid > req->r_tid)
588                         p = &(*p)->rb_right;
589                 else
590                         BUG();
591         }
592 
593         rb_link_node(&new->r_node, parent, p);
594         rb_insert_color(&new->r_node, &mdsc->request_tree);
595 }
596 
597 /*
598  * Register an in-flight request, and assign a tid.  Link to directory
599  * are modifying (if any).
600  *
601  * Called under mdsc->mutex.
602  */
603 static void __register_request(struct ceph_mds_client *mdsc,
604                                struct ceph_mds_request *req,
605                                struct inode *dir)
606 {
607         req->r_tid = ++mdsc->last_tid;
608         if (req->r_num_caps)
609                 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
610                                   req->r_num_caps);
611         dout("__register_request %p tid %lld\n", req, req->r_tid);
612         ceph_mdsc_get_request(req);
613         __insert_request(mdsc, req);
614 
615         req->r_uid = current_fsuid();
616         req->r_gid = current_fsgid();
617 
618         if (dir) {
619                 struct ceph_inode_info *ci = ceph_inode(dir);
620 
621                 ihold(dir);
622                 spin_lock(&ci->i_unsafe_lock);
623                 req->r_unsafe_dir = dir;
624                 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
625                 spin_unlock(&ci->i_unsafe_lock);
626         }
627 }
628 
629 static void __unregister_request(struct ceph_mds_client *mdsc,
630                                  struct ceph_mds_request *req)
631 {
632         dout("__unregister_request %p tid %lld\n", req, req->r_tid);
633         rb_erase(&req->r_node, &mdsc->request_tree);
634         RB_CLEAR_NODE(&req->r_node);
635 
636         if (req->r_unsafe_dir) {
637                 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
638 
639                 spin_lock(&ci->i_unsafe_lock);
640                 list_del_init(&req->r_unsafe_dir_item);
641                 spin_unlock(&ci->i_unsafe_lock);
642 
643                 iput(req->r_unsafe_dir);
644                 req->r_unsafe_dir = NULL;
645         }
646 
647         complete_all(&req->r_safe_completion);
648 
649         ceph_mdsc_put_request(req);
650 }
651 
652 /*
653  * Choose mds to send request to next.  If there is a hint set in the
654  * request (e.g., due to a prior forward hint from the mds), use that.
655  * Otherwise, consult frag tree and/or caps to identify the
656  * appropriate mds.  If all else fails, choose randomly.
657  *
658  * Called under mdsc->mutex.
659  */
660 static struct dentry *get_nonsnap_parent(struct dentry *dentry)
661 {
662         /*
663          * we don't need to worry about protecting the d_parent access
664          * here because we never renaming inside the snapped namespace
665          * except to resplice to another snapdir, and either the old or new
666          * result is a valid result.
667          */
668         while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
669                 dentry = dentry->d_parent;
670         return dentry;
671 }
672 
673 static int __choose_mds(struct ceph_mds_client *mdsc,
674                         struct ceph_mds_request *req)
675 {
676         struct inode *inode;
677         struct ceph_inode_info *ci;
678         struct ceph_cap *cap;
679         int mode = req->r_direct_mode;
680         int mds = -1;
681         u32 hash = req->r_direct_hash;
682         bool is_hash = req->r_direct_is_hash;
683 
684         /*
685          * is there a specific mds we should try?  ignore hint if we have
686          * no session and the mds is not up (active or recovering).
687          */
688         if (req->r_resend_mds >= 0 &&
689             (__have_session(mdsc, req->r_resend_mds) ||
690              ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
691                 dout("choose_mds using resend_mds mds%d\n",
692                      req->r_resend_mds);
693                 return req->r_resend_mds;
694         }
695 
696         if (mode == USE_RANDOM_MDS)
697                 goto random;
698 
699         inode = NULL;
700         if (req->r_inode) {
701                 inode = req->r_inode;
702         } else if (req->r_dentry) {
703                 /* ignore race with rename; old or new d_parent is okay */
704                 struct dentry *parent = req->r_dentry->d_parent;
705                 struct inode *dir = parent->d_inode;
706 
707                 if (dir->i_sb != mdsc->fsc->sb) {
708                         /* not this fs! */
709                         inode = req->r_dentry->d_inode;
710                 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
711                         /* direct snapped/virtual snapdir requests
712                          * based on parent dir inode */
713                         struct dentry *dn = get_nonsnap_parent(parent);
714                         inode = dn->d_inode;
715                         dout("__choose_mds using nonsnap parent %p\n", inode);
716                 } else {
717                         /* dentry target */
718                         inode = req->r_dentry->d_inode;
719                         if (!inode || mode == USE_AUTH_MDS) {
720                                 /* dir + name */
721                                 inode = dir;
722                                 hash = ceph_dentry_hash(dir, req->r_dentry);
723                                 is_hash = true;
724                         }
725                 }
726         }
727 
728         dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
729              (int)hash, mode);
730         if (!inode)
731                 goto random;
732         ci = ceph_inode(inode);
733 
734         if (is_hash && S_ISDIR(inode->i_mode)) {
735                 struct ceph_inode_frag frag;
736                 int found;
737 
738                 ceph_choose_frag(ci, hash, &frag, &found);
739                 if (found) {
740                         if (mode == USE_ANY_MDS && frag.ndist > 0) {
741                                 u8 r;
742 
743                                 /* choose a random replica */
744                                 get_random_bytes(&r, 1);
745                                 r %= frag.ndist;
746                                 mds = frag.dist[r];
747                                 dout("choose_mds %p %llx.%llx "
748                                      "frag %u mds%d (%d/%d)\n",
749                                      inode, ceph_vinop(inode),
750                                      frag.frag, mds,
751                                      (int)r, frag.ndist);
752                                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
753                                     CEPH_MDS_STATE_ACTIVE)
754                                         return mds;
755                         }
756 
757                         /* since this file/dir wasn't known to be
758                          * replicated, then we want to look for the
759                          * authoritative mds. */
760                         mode = USE_AUTH_MDS;
761                         if (frag.mds >= 0) {
762                                 /* choose auth mds */
763                                 mds = frag.mds;
764                                 dout("choose_mds %p %llx.%llx "
765                                      "frag %u mds%d (auth)\n",
766                                      inode, ceph_vinop(inode), frag.frag, mds);
767                                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
768                                     CEPH_MDS_STATE_ACTIVE)
769                                         return mds;
770                         }
771                 }
772         }
773 
774         spin_lock(&ci->i_ceph_lock);
775         cap = NULL;
776         if (mode == USE_AUTH_MDS)
777                 cap = ci->i_auth_cap;
778         if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
779                 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
780         if (!cap) {
781                 spin_unlock(&ci->i_ceph_lock);
782                 goto random;
783         }
784         mds = cap->session->s_mds;
785         dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
786              inode, ceph_vinop(inode), mds,
787              cap == ci->i_auth_cap ? "auth " : "", cap);
788         spin_unlock(&ci->i_ceph_lock);
789         return mds;
790 
791 random:
792         mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
793         dout("choose_mds chose random mds%d\n", mds);
794         return mds;
795 }
796 
797 
798 /*
799  * session messages
800  */
801 static struct ceph_msg *create_session_msg(u32 op, u64 seq)
802 {
803         struct ceph_msg *msg;
804         struct ceph_mds_session_head *h;
805 
806         msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
807                            false);
808         if (!msg) {
809                 pr_err("create_session_msg ENOMEM creating msg\n");
810                 return NULL;
811         }
812         h = msg->front.iov_base;
813         h->op = cpu_to_le32(op);
814         h->seq = cpu_to_le64(seq);
815         return msg;
816 }
817 
818 /*
819  * Mark @session as opening and queue a CEPH_SESSION_REQUEST_OPEN
820  * message, carrying the session's current s_seq, on its connection.
     *
     * The mds state read from the mdsmap is only used for the debug
     * message.  Returns 0 on success, or -ENOMEM if the message could not
     * be allocated; s_state and s_renew_requested are already updated by
     * then.
     *
821  * called under mdsc->mutex
822  */
823 static int __open_session(struct ceph_mds_client *mdsc,
824                           struct ceph_mds_session *session)
825 {
826         int rank = session->s_mds;
827         int mds_state = ceph_mdsmap_get_state(mdsc->mdsmap, rank);
828         struct ceph_msg *open_msg;
829 
830         dout("open_session to mds%d (%s)\n", rank,
831              ceph_mds_state_name(mds_state));
832 
833         /* flip the session to "opening" and stamp the request time */
834         session->s_state = CEPH_MDS_SESSION_OPENING;
835         session->s_renew_requested = jiffies;
836 
837         open_msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN,
838                                       session->s_seq);
839         if (open_msg == NULL)
840                 return -ENOMEM;
841         ceph_con_send(&session->s_con, open_msg);
842         return 0;
843 }
844 
845 /*
846  * Look up the session for mds @target, registering one if none exists,
847  * and send it an open request when its state is NEW or CLOSING.
     *
     * Returns the session, or the ERR_PTR produced by register_session().
     * The return value of __open_session() is not checked.
     * NOTE(review): callers in this file put the returned session, so a
     * reference appears to be held on return -- confirm against
     * __ceph_lookup_mds_session() and register_session().
     *
848  * called under mdsc->mutex
849  */
850 static struct ceph_mds_session *
851 __open_export_target_session(struct ceph_mds_client *mdsc, int target)
852 {
853         struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, target);
854 
855         if (s == NULL) {
856                 /* no session for this mds yet: register one */
857                 s = register_session(mdsc, target);
858                 if (IS_ERR(s))
859                         return s;
860         }
861 
862         if (s->s_state == CEPH_MDS_SESSION_CLOSING ||
863             s->s_state == CEPH_MDS_SESSION_NEW)
864                 __open_session(mdsc, s);
865         return s;
866 }
867 
/*
 * Locked wrapper around __open_export_target_session(): mdsc->mutex is
 * held for the duration of the call and the helper's result (a session
 * or an ERR_PTR) is handed back unchanged.
 */
868 struct ceph_mds_session *
869 ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
870 {
871         struct ceph_mds_session *result;
872 
873         dout("open_export_target_session to mds%d\n", target);
874 
875         mutex_lock(&mdsc->mutex);
876         result = __open_export_target_session(mdsc, target);
877         mutex_unlock(&mdsc->mutex);
878 
879         return result;
880 }
881 
/*
 * Open a session to every export target that the mdsmap lists for the
 * mds behind @session.  Nothing is done when that mds rank is not below
 * m_max_mds.  Each session handed back by __open_export_target_session()
 * is put again right away; ERR_PTR results are skipped.
 */
882 static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
883                                           struct ceph_mds_session *session)
884 {
885         struct ceph_mds_session *target;
886         struct ceph_mds_info *info;
887         int n, rank = session->s_mds;
888 
889         if (rank >= mdsc->mdsmap->m_max_mds)
890                 return;
891 
892         info = &mdsc->mdsmap->m_info[rank];
893         dout("open_export_target_sessions for mds%d (%d targets)\n",
894              session->s_mds, info->num_export_targets);
895         for (n = 0; n < info->num_export_targets; n++) {
896                 target = __open_export_target_session(mdsc,
897                                                 info->export_targets[n]);
898                 if (!IS_ERR(target))
899                         ceph_put_mds_session(target);
900         }
901 }
902 
/*
 * Locked variant of __open_export_target_sessions(): the helper is run
 * for @session with mdsc->mutex held for the duration of the call.
 */
903 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
904                                            struct ceph_mds_session *session)
905 {
906         mutex_lock(&mdsc->mutex);
907         __open_export_target_sessions(mdsc, session);
908         mutex_unlock(&mdsc->mutex);
909 }
910 
911 /*
912  * session caps
913  */
914 
915 /*
916  * Drop every cap release message queued on this session.  Both the
917  * s_cap_releases and the s_cap_releases_done lists are emptied; each
     * message is unlinked and put while s_cap_lock is held.
     */
918 static void cleanup_cap_releases(struct ceph_mds_session *session)
919 {
920         struct list_head *queues[2] = {
921                 &session->s_cap_releases,
922                 &session->s_cap_releases_done,
923         };
924         struct ceph_msg *m;
925         int q;
926 
927         spin_lock(&session->s_cap_lock);
928         for (q = 0; q < 2; q++)
929                 while (!list_empty(queues[q])) {
930                         m = list_first_entry(queues[q], struct ceph_msg,
931                                              list_head);
932                         list_del_init(&m->list_head);
933                         ceph_msg_put(m);
934                 }
935         spin_unlock(&session->s_cap_lock);
936 }
937 
938 /*
939  * Helper to safely iterate over all caps associated with a session, with
940  * special care taken to handle a racing __ceph_remove_cap().
941  *
     * For each cap on session->s_caps whose inode can be grabbed with
     * igrab(), the cap is published in session->s_cap_iterator, s_cap_lock
     * is dropped and @cb(inode, cap, @arg) is called.  If cap->ci is NULL
     * once the lock has been retaken, the cap is unlinked from the session
     * here and put later, when the lock is not held.
     *
     * Returns the callback's value as soon as it is negative (ending the
     * walk early), otherwise 0 after the whole list has been visited.
     *
942  * Caller must hold session s_mutex.
943  */
944 static int iterate_session_caps(struct ceph_mds_session *session,
945                                  int (*cb)(struct inode *, struct ceph_cap *,
946                                             void *), void *arg)
947 {
948         struct list_head *p;
949         struct ceph_cap *cap;
950         struct inode *inode, *last_inode = NULL;
951         struct ceph_cap *old_cap = NULL;
952         int ret;
953 
954         dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
955         spin_lock(&session->s_cap_lock);
956         p = session->s_caps.next;
957         while (p != &session->s_caps) {
958                 cap = list_entry(p, struct ceph_cap, session_caps);
959                 inode = igrab(&cap->ci->vfs_inode);
960                 if (!inode) {
                            /* no inode reference could be taken: skip this cap */
961                         p = p->next;
962                         continue;
963                 }
964                 session->s_cap_iterator = cap;
965                 spin_unlock(&session->s_cap_lock);
966 
                    /*
                     * s_cap_lock is not held here, so the inode and cap left
                     * over from the previous pass can be released now.
                     */
967                 if (last_inode) {
968                         iput(last_inode);
969                         last_inode = NULL;
970                 }
971                 if (old_cap) {
972                         ceph_put_cap(session->s_mdsc, old_cap);
973                         old_cap = NULL;
974                 }
975 
976                 ret = cb(inode, cap, arg);
977                 last_inode = inode;
978 
979                 spin_lock(&session->s_cap_lock);
                    /* advance before the cap may be unlinked just below */
980                 p = p->next;
981                 if (cap->ci == NULL) {
982                         dout("iterate_session_caps  finishing cap %p removal\n",
983                              cap);
984                         BUG_ON(cap->session != session);
985                         list_del_init(&cap->session_caps);
986                         session->s_nr_caps--;
987                         cap->session = NULL;
988                         old_cap = cap;  /* put_cap it w/o locks held */
989                 }
990                 if (ret < 0)
991                         goto out;
992         }
993         ret = 0;
994 out:
995         session->s_cap_iterator = NULL;
996         spin_unlock(&session->s_cap_lock);
997 
            /* release whatever the final pass left behind */
998         if (last_inode)
999                 iput(last_inode);
1000         if (old_cap)
1001                 ceph_put_cap(session->s_mdsc, old_cap);
1002 
1003         return ret;
1004 }
1005 
/*
 * iterate_session_caps() callback used by remove_session_caps().
 *
 * Removes @cap under i_ceph_lock.  If __ceph_is_any_real_caps() then
 * reports no caps left on the inode, any dirty and flushing cap state
 * is discarded under mdsc->cap_dirty_lock (with a pr_info for each),
 * and, when something was discarded and i_wrbuffer_ref is non-zero,
 * the write-buffer reference counters are zeroed as well.
 *
 * "drop" is the number of iput() calls made after both locks have been
 * released; it ends up as 0, 1 or 2.
 * NOTE(review): presumably these balance inode references pinned by
 * the dirty/flushing state and by the write-buffer refs -- confirm
 * where those references are taken.
 *
 * @arg is unused.  Always returns 0, so the iteration continues.
 */
1006 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1007                                   void *arg)
1008 {
1009         struct ceph_inode_info *ci = ceph_inode(inode);
1010         int drop = 0;
1011 
1012         dout("removing cap %p, ci is %p, inode is %p\n",
1013              cap, ci, &ci->vfs_inode);
1014         spin_lock(&ci->i_ceph_lock);
1015         __ceph_remove_cap(cap, false);
1016         if (!__ceph_is_any_real_caps(ci)) {
1017                 struct ceph_mds_client *mdsc =
1018                         ceph_sb_to_client(inode->i_sb)->mdsc;
1019 
1020                 spin_lock(&mdsc->cap_dirty_lock);
1021                 if (!list_empty(&ci->i_dirty_item)) {
1022                         pr_info(" dropping dirty %s state for %p %lld\n",
1023                                 ceph_cap_string(ci->i_dirty_caps),
1024                                 inode, ceph_ino(inode));
1025                         ci->i_dirty_caps = 0;
1026                         list_del_init(&ci->i_dirty_item);
1027                         drop = 1;
1028                 }
1029                 if (!list_empty(&ci->i_flushing_item)) {
1030                         pr_info(" dropping dirty+flushing %s state for %p %lld\n",
1031                                 ceph_cap_string(ci->i_flushing_caps),
1032                                 inode, ceph_ino(inode));
1033                         ci->i_flushing_caps = 0;
1034                         list_del_init(&ci->i_flushing_item);
1035                         mdsc->num_cap_flushing--;
                             /* assigned, not incremented: dirty + flushing count once */
1036                         drop = 1;
1037                 }
1038                 if (drop && ci->i_wrbuffer_ref) {
1039                         pr_info(" dropping dirty data for %p %lld\n",
1040                                 inode, ceph_ino(inode));
1041                         ci->i_wrbuffer_ref = 0;
1042                         ci->i_wrbuffer_ref_head = 0;
1043                         drop++;
1044                 }
1045                 spin_unlock(&mdsc->cap_dirty_lock);
1046         }
1047         spin_unlock(&ci->i_ceph_lock);
             /* the iput() calls are made only after both locks are released */
1048         while (drop--)
1049                 iput(inode);
1050         return 0;
1051 }
1052 
1053 /*
      * Remove every cap attached to @session and free its preallocated
      * cap release messages.
      *
      * remove_session_caps_cb() is first run over all caps.  If s_nr_caps
      * is still positive afterwards, the first cap left on s_caps is looked
      * up by vino (with s_cap_lock dropped) over and over, until the list
      * is empty or the same cap is seen at the head twice in a row.  The
      * function BUGs if caps remain after that or if s_cap_flushing is not
      * empty.
      *
1054  * caller must hold session s_mutex
1055  */
1056 static void remove_session_caps(struct ceph_mds_session *session)
1057 {
1058         dout("remove_session_caps on %p\n", session);
1059         iterate_session_caps(session, remove_session_caps_cb, NULL);
1060 
1061         spin_lock(&session->s_cap_lock);
1062         if (session->s_nr_caps > 0) {
1063                 struct super_block *sb = session->s_mdsc->fsc->sb;
1064                 struct inode *inode;
1065                 struct ceph_cap *cap, *prev = NULL;
1066                 struct ceph_vino vino;
1067                 /*
1068                  * iterate_session_caps() skips inodes that are being
1069                  * deleted, we need to wait until deletions are complete.
1070                  * __wait_on_freeing_inode() is designed for the job,
1071                  * but it is not exported, so use lookup inode function
1072                  * to access it.
1073                  */
1074                 while (!list_empty(&session->s_caps)) {
1075                         cap = list_entry(session->s_caps.next,
1076                                          struct ceph_cap, session_caps);
                             /* same head as last time: no progress, give up */
1077                         if (cap == prev)
1078                                 break;
1079                         prev = cap;
                             /* copy the vino while s_cap_lock is still held */
1080                         vino = cap->ci->i_vino;
1081                         spin_unlock(&session->s_cap_lock);
1082 
1083                         inode = ceph_find_inode(sb, vino);
1084                         iput(inode);
1085 
1086                         spin_lock(&session->s_cap_lock);
1087                 }
1088         }
1089         spin_unlock(&session->s_cap_lock);
1090 
1091         BUG_ON(session->s_nr_caps > 0);
1092         BUG_ON(!list_empty(&session->s_cap_flushing));
1093         cleanup_cap_releases(session);
1094 }
1095 
1096 /*
1097  * iterate_session_caps() callback: wake every waiter on the inode's
1098  * i_cap_wq.  When @arg is non-NULL, i_wanted_max_size and
1099  * i_requested_max_size are additionally cleared under i_ceph_lock.
      * Always returns 0, so the iteration is never cut short.
      *
1100  * caller must hold s_mutex.
1101  */
1102 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1103                               void *arg)
1104 {
1105         struct ceph_inode_info *cinfo = ceph_inode(inode);
1106 
1107         wake_up_all(&cinfo->i_cap_wq);
1108         if (arg == NULL)
1109                 return 0;
1110         spin_lock(&cinfo->i_ceph_lock);
1111         cinfo->i_requested_max_size = 0;
1112         cinfo->i_wanted_max_size = 0;
1113         spin_unlock(&cinfo->i_ceph_lock);
1114         return 0;
1115 }
1116 
/*
 * Run wake_up_session_cb() over every cap of @session.  @reconnect is
 * handed to the callback as its opaque argument, so a non-zero value
 * reaches it as a non-NULL pointer.
 */
1117 static void wake_up_session_caps(struct ceph_mds_session *session,
1118                                  int reconnect)
1119 {
1120         dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1121         iterate_session_caps(session, wake_up_session_cb,
1122                              (void *)(unsigned long)reconnect);
1123 }
1124 
1125 /*
1126  * Ask the MDS to renew the caps held through this session by queueing
1127  * a CEPH_SESSION_REQUEST_RENEWCAPS message that carries the next
1128  * s_renew_seq.  s_renew_requested is stamped with the current jiffies
      * first, and "caps stale" is logged when s_cap_ttl has already passed
      * and is not older than the previous renew request.
      *
      * Nothing is sent (and 0 is returned) while the mdsmap state of the
      * mds is below CEPH_MDS_STATE_RECONNECT.  Returns -ENOMEM when the
      * message cannot be allocated; s_renew_seq has been bumped by then.
      *
1129  * caller holds s_mutex
1130  */
1131 static int send_renew_caps(struct ceph_mds_client *mdsc,
1132                            struct ceph_mds_session *session)
1133 {
1134         struct ceph_msg *renew;
1135         int mds_state;
1136 
1137         if (time_after_eq(jiffies, session->s_cap_ttl) &&
1138             time_after_eq(session->s_cap_ttl, session->s_renew_requested))
1139                 pr_info("mds%d caps stale\n", session->s_mds);
1140         session->s_renew_requested = jiffies;
1141 
1142         mds_state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
1143         if (mds_state >= CEPH_MDS_STATE_RECONNECT) {
1144                 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
1145                      ceph_mds_state_name(mds_state));
1146                 renew = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
1147                                            ++session->s_renew_seq);
1148                 if (!renew)
1149                         return -ENOMEM;
1150                 ceph_con_send(&session->s_con, renew);
1151                 return 0;
1152         }
1153 
1154         /* a recovering mds must reconnect with its clients first */
1155         dout("send_renew_caps ignoring mds%d (%s)\n",
1156              session->s_mds, ceph_mds_state_name(mds_state));
1157         return 0;
1158 }
1160 
/*
 * Queue a CEPH_SESSION_FLUSHMSG_ACK session message carrying @seq on the
 * session's connection.  Returns 0, or -ENOMEM if the message could not
 * be allocated.  @mdsc is not used.
 */
1161 static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
1162                              struct ceph_mds_session *session, u64 seq)
1163 {
1164         struct ceph_msg *ack;
1165 
1166         dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
1167              session->s_mds, session_state_name(session->s_state), seq);
1168         ack = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
1169         if (ack == NULL)
1170                 return -ENOMEM;
1171         ceph_con_send(&session->s_con, ack);
1172         return 0;
1173 }
1174 
1175 
1176 /*
1177  * Note new cap ttl, and any transition from stale -> not stale (fresh?).
1178  *
      * s_cap_ttl is recomputed as s_renew_requested plus the mdsmap session
      * timeout.  The caps count as having been stale only when @is_renew is
      * set and jiffies had reached the old ttl.  In that case the outcome
      * is logged, and if jiffies is before the new ttl the session's cap
      * waiters are woken once s_cap_lock has been released.
      *
1179  * Called under session->s_mutex
1180  */
1181 static void renewed_caps(struct ceph_mds_client *mdsc,
1182                          struct ceph_mds_session *session, int is_renew)
1183 {
1184         int was_stale;
1185         int wake = 0;
1186 
1187         spin_lock(&session->s_cap_lock);
1188         was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
1189 
1190         session->s_cap_ttl = session->s_renew_requested +
1191                 mdsc->mdsmap->m_session_timeout*HZ;
1192 
1193         if (was_stale) {
1194                 if (time_before(jiffies, session->s_cap_ttl)) {
1195                         pr_info("mds%d caps renewed\n", session->s_mds);
1196                         wake = 1;
1197                 } else {
1198                         pr_info("mds%d caps still stale\n", session->s_mds);
1199                 }
1200         }
             /* jiffies before the new ttl means the caps are fresh now */
1201         dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
1202              session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
1203              time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
1204         spin_unlock(&session->s_cap_lock);
1205 
1206         if (wake)
1207                 wake_up_session_caps(session, 0);
1208 }
1209 
1210 /*
1211  * Queue a CEPH_SESSION_REQUEST_CLOSE message carrying the session's
1212  * s_seq on its connection.  Returns 0, or -ENOMEM if the message could
      * not be allocated.  The session state is not changed here.
      */
1213 static int request_close_session(struct ceph_mds_client *mdsc,
1214                                  struct ceph_mds_session *session)
1215 {
1216         struct ceph_msg *m;
1217 
1218         dout("request_close_session mds%d state %s seq %lld\n",
1219              session->s_mds, session_state_name(session->s_state),
1220              session->s_seq);
1221         m = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
1222         if (m == NULL)
1223                 return -ENOMEM;
1224         ceph_con_send(&session->s_con, m);
1225         return 0;
1226 }
1227 
1228 /*
      * Move @session to CEPH_MDS_SESSION_CLOSING and send the close request.
      * If s_state is already CLOSING or any higher-valued state, nothing is
      * done and 0 is returned.  Otherwise the result of
      * request_close_session() is returned; the state change is not undone
      * when that fails.
      *
1229  * Called with s_mutex held.
1230  */
1231 static int __close_session(struct ceph_mds_client *mdsc,
1232                          struct ceph_mds_session *session)
1233 {
1234         if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
1235                 return 0;
1236         session->s_state = CEPH_MDS_SESSION_CLOSING;
1237         return request_close_session(mdsc, session);
1238 }
1239 
1240 /*
1241  * Trim old(er) caps.
1242  *
1243  * Because we can't cache an inode without one or more caps, we do
1244  * this indirectly: if a cap is unused, we prune its aliases, at which
1245  * point the inode will hopefully get dropped too.
1246  *
1247  * Yes, this is a bit sloppy.  Our only real goal here is to respond to
1248  * memory pressure from the MDS, though, so it needn't be perfect.
      *
      * iterate_session_caps() callback; @arg is the session.  Returns -1,
      * which ends the iteration, once session->s_trim_caps has dropped to
      * zero or below, and 0 in every other case.  s_trim_caps is decremented
      * for each cap that is either removed or has its inode's aliases
      * pruned.
1249  */
1250 static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1251 {
1252         struct ceph_mds_session *session = arg;
1253         struct ceph_inode_info *ci = ceph_inode(inode);
1254         int used, wanted, oissued, mine;
1255 
1256         if (session->s_trim_caps <= 0)
1257                 return -1;
1258 
1259         spin_lock(&ci->i_ceph_lock);
1260         mine = cap->issued | cap->implemented;
1261         used = __ceph_caps_used(ci);
1262         wanted = __ceph_caps_file_wanted(ci);
1263         oissued = __ceph_caps_issued_other(ci, cap);
1264 
1265         dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
1266              inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
1267              ceph_cap_string(used), ceph_cap_string(wanted));
             /*
              * The auth cap is kept while the inode has dirty or flushing
              * caps, or while any write cap is used or wanted.
              */
1268         if (cap == ci->i_auth_cap) {
1269                 if (ci->i_dirty_caps | ci->i_flushing_caps)
1270                         goto out;
1271                 if ((used | wanted) & CEPH_CAP_ANY_WR)
1272                         goto out;
1273         }
             /* bits in use or wanted that only this cap provides */
1274         if ((used | wanted) & ~oissued & mine)
1275                 goto out;   /* we need these caps */
1276 
1277         session->s_trim_caps--;
1278         if (oissued) {
1279                 /* we aren't the only cap.. just remove us */
1280                 __ceph_remove_cap(cap, true);
1281         } else {
1282                 /* try to drop referring dentries */
1283                 spin_unlock(&ci->i_ceph_lock);
1284                 d_prune_aliases(inode);
1285                 dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
1286                      inode, cap, atomic_read(&inode->i_count));
                     /* i_ceph_lock was already released above */
1287                 return 0;
1288         }
1289 
1290 out:
1291         spin_unlock(&ci->i_ceph_lock);
1292         return 0;
1293 }
1294 
1295 /*
1296  * Try to bring the session's cap count down to @max_caps by running
1297  * trim_caps_cb() over its caps with s_trim_caps set to the excess.
      * Nothing is done when the session holds no more than @max_caps caps.
      * s_trim_caps is reset to 0 afterwards.  Always returns 0.
      */
1298 static int trim_caps(struct ceph_mds_client *mdsc,
1299                      struct ceph_mds_session *session,
1300                      int max_caps)
1301 {
1302         int excess = session->s_nr_caps - max_caps;
1303 
1304         dout("trim_caps mds%d start: %d / %d, trim %d\n",
1305              session->s_mds, session->s_nr_caps, max_caps, excess);
1306         if (excess <= 0)
1307                 return 0;
1308         session->s_trim_caps = excess;
1309         iterate_session_caps(session, trim_caps_cb, session);
1310         dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1311              session->s_mds, session->s_nr_caps, max_caps,
1312              excess - session->s_trim_caps);
1313         session->s_trim_caps = 0;
1314         return 0;
1315 }
1316 
1317 /*
1318  * Allocate cap_release messages.  If there is a partially full message
1319  * in the queue, try to allocate enough to cover its remainder, so that
1320  * we can send it immediately.
1321  *
      * Messages are added to s_cap_releases until s_num_cap_releases is at
      * least s_nr_caps plus "extra", where extra starts from the
      * cap_release_safety mount option and grows by the free slots of a
      * partially filled first message.  That partial message is then moved
      * to the tail of s_cap_releases_done.
      *
      * Returns 0 on success, or -ENOMEM if a message allocation fails; in
      * that case the partial message is left where it was.
      *
1322  * Called under s_mutex.
1323  */
1324 int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1325                           struct ceph_mds_session *session)
1326 {
1327         struct ceph_msg *msg, *partial = NULL;
1328         struct ceph_mds_cap_release *head;
1329         int err = -ENOMEM;
1330         int extra = mdsc->fsc->mount_options->cap_release_safety;
1331         int num;
1332 
1333         dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1334              extra);
1335 
1336         spin_lock(&session->s_cap_lock);
1337 
1338         if (!list_empty(&session->s_cap_releases)) {
1339                 msg = list_first_entry(&session->s_cap_releases,
1340                                        struct ceph_msg,
1341                                  list_head);
1342                 head = msg->front.iov_base;
1343                 num = le32_to_cpu(head->num);
1344                 if (num) {
1345                         dout(" partial %p with (%d/%d)\n", msg, num,
1346                              (int)CEPH_CAPS_PER_RELEASE);
                             /* also cover the slots this message still has free */
1347                         extra += CEPH_CAPS_PER_RELEASE - num;
1348                         partial = msg;
1349                 }
1350         }
1351         while (session->s_num_cap_releases < session->s_nr_caps + extra) {
                     /* the allocation is done with s_cap_lock dropped */
1352                 spin_unlock(&session->s_cap_lock);
1353                 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1354                                    GFP_NOFS, false);
1355                 if (!msg)
1356                         goto out_unlocked;
1357                 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1358                      (int)msg->front.iov_len);
1359                 head = msg->front.iov_base;
1360                 head->num = cpu_to_le32(0);
1361                 msg->front.iov_len = sizeof(*head);
1362                 spin_lock(&session->s_cap_lock);
1363                 list_add(&msg->list_head, &session->s_cap_releases);
1364                 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1365         }
1366 
             /*
              * NOTE(review): "partial" was looked up before the lock was
              * dropped in the loop above; presumably the caller's s_mutex
              * keeps it valid -- confirm.
              */
1367         if (partial) {
1368                 head = partial->front.iov_base;
1369                 num = le32_to_cpu(head->num);
1370                 dout(" queueing partial %p with %d/%d\n", partial, num,
1371                      (int)CEPH_CAPS_PER_RELEASE);
1372                 list_move_tail(&partial->list_head,
1373                                &session->s_cap_releases_done);
                     /* its unused slots no longer count as available */
1374                 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1375         }
1376         err = 0;
1377         spin_unlock(&session->s_cap_lock);
             /* the allocation-failure path arrives here without the lock held */
1378 out_unlocked:
1379         return err;
1380 }
1381 
1382 /*
1383  * Check whether cap flushes up through want_flush_seq have completed.
1384  *
      * Nothing is flushed here.  For each registered session, the first
      * inode on s_cap_flushing is examined under the session's s_mutex; if
      * its i_cap_flush_seq is <= want_flush_seq the check fails.
      * NOTE(review): only the first list entry is inspected, so the list is
      * presumably ordered by flush seq -- confirm.
      *
1385  * returns true if we've flushed through want_flush_seq
1386  */
1387 static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1388 {
1389         int mds, ret = 1;
1390 
1391         dout("check_cap_flush want %lld\n", want_flush_seq);
1392         mutex_lock(&mdsc->mutex);
1393         for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1394                 struct ceph_mds_session *session = mdsc->sessions[mds];
1395 
1396                 if (!session)
1397                         continue;
                     /* take a session ref, then swap mdsc->mutex for s_mutex */
1398                 get_session(session);
1399                 mutex_unlock(&mdsc->mutex);
1400 
1401                 mutex_lock(&session->s_mutex);
1402                 if (!list_empty(&session->s_cap_flushing)) {
1403                         struct ceph_inode_info *ci =
1404                                 list_entry(session->s_cap_flushing.next,
1405                                            struct ceph_inode_info,
1406                                            i_flushing_item);
1407                         struct inode *inode = &ci->vfs_inode;
1408 
1409                         spin_lock(&ci->i_ceph_lock);
1410                         if (ci->i_cap_flush_seq <= want_flush_seq) {
1411                                 dout("check_cap_flush still flushing %p "
1412                                      "seq %lld <= %lld to mds%d\n", inode,
1413                                      ci->i_cap_flush_seq, want_flush_seq,
1414                                      session->s_mds);
1415                                 ret = 0;
1416                         }
1417                         spin_unlock(&ci->i_ceph_lock);
1418                 }
1419                 mutex_unlock(&session->s_mutex);
1420                 ceph_put_mds_session(session);
1421 
                     /* mdsc->mutex is not held at this point */
1422                 if (!ret)
1423                         return ret;
1424                 mutex_lock(&mdsc->mutex);
1425         }
1426 
1427         mutex_unlock(&mdsc->mutex);
1428         dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1429         return ret;
1430 }
1431 
1432 /*
      * Send every message queued on s_cap_releases_done.  Each message is
      * unlinked under s_cap_lock; the lock is then dropped while the
      * header's front_len is set from front.iov_len and the message is
      * handed to ceph_con_send().
      *
1433  * called under s_mutex
1434  */
1435 void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1436                             struct ceph_mds_session *session)
1437 {
1438         struct ceph_msg *msg;
1439 
1440         dout("send_cap_releases mds%d\n", session->s_mds);
1441         spin_lock(&session->s_cap_lock);
1442         while (!list_empty(&session->s_cap_releases_done)) {
1443                 msg = list_first_entry(&session->s_cap_releases_done,
1444                                  struct ceph_msg, list_head);
1445                 list_del_init(&msg->list_head);
1446                 spin_unlock(&session->s_cap_lock);
                     /* the header must carry the final front length */
1447                 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1448                 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1449                 ceph_con_send(&session->s_con, msg);
1450                 spin_lock(&session->s_cap_lock);
1451         }
1452         spin_unlock(&session->s_cap_lock);
1453 }
1454 
/*
 * Throw away the cap releases recorded so far while keeping the
 * messages for reuse.
 *
 * The first message on s_cap_releases has its count zeroed and its
 * front length reset to the bare header.  Every message on
 * s_cap_releases_done is reset the same way and moved back to the head
 * of s_cap_releases.  In both cases the discarded count is added back
 * to s_num_cap_releases.  @mdsc is only part of the signature.
 *
 * NOTE(review): s_cap_lock is not taken here although other users of
 * these lists in this file take it -- presumably the caller holds it;
 * confirm.
 */
1455 static void discard_cap_releases(struct ceph_mds_client *mdsc,
1456                                  struct ceph_mds_session *session)
1457 {
1458         struct ceph_msg *msg;
1459         struct ceph_mds_cap_release *head;
1460         unsigned num;
1461 
1462         dout("discard_cap_releases mds%d\n", session->s_mds);
1463 
1464         if (!list_empty(&session->s_cap_releases)) {
1465                 /* zero out the in-progress message */
1466                 msg = list_first_entry(&session->s_cap_releases,
1467                                         struct ceph_msg, list_head);
1468                 head = msg->front.iov_base;
1469                 num = le32_to_cpu(head->num);
1470                 dout("discard_cap_releases mds%d %p %u\n",
1471                      session->s_mds, msg, num);
1472                 head->num = cpu_to_le32(0);
1473                 msg->front.iov_len = sizeof(*head);
1474                 session->s_num_cap_releases += num;
1475         }
1476 
1477         /* requeue completed messages */
1478         while (!list_empty(&session->s_cap_releases_done)) {
1479                 msg = list_first_entry(&session->s_cap_releases_done,
1480                                  struct ceph_msg, list_head);
1481                 list_del_init(&msg->list_head);
1482 
1483                 head = msg->front.iov_base;
1484                 num = le32_to_cpu(head->num);
1485                 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1486                      num);
1487                 session->s_num_cap_releases += num;
1488                 head->num = cpu_to_le32(0);
1489                 msg->front.iov_len = sizeof(*head);
1490                 list_add(&msg->list_head, &session->s_cap_releases);
1491         }
1492 }
1493 
1494 /*
1495  * requests
1496  */
1497 
1498 /*
1499  * Allocate a zeroed mds request and set up its mutex, kref, list heads
1500  * and completions.  r_resend_mds and r_fmode start out as -1,
      * r_started is the current jiffies, and @op / @mode are stored in
      * r_op / r_direct_mode.  Returns ERR_PTR(-ENOMEM) when the allocation
      * fails.
      */
1501 struct ceph_mds_request *
1502 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1503 {
1504         struct ceph_mds_request *r = kzalloc(sizeof(*r), GFP_NOFS);
1505 
1506         if (r == NULL)
1507                 return ERR_PTR(-ENOMEM);
1508 
1509         r->r_mdsc = mdsc;
1510         r->r_op = op;
1511         r->r_direct_mode = mode;
1512         r->r_started = jiffies;
1513         r->r_resend_mds = -1;
1514         r->r_fmode = -1;
1515 
1516         kref_init(&r->r_kref);
1517         mutex_init(&r->r_fill_mutex);
1518         init_completion(&r->r_completion);
1519         init_completion(&r->r_safe_completion);
1520         INIT_LIST_HEAD(&r->r_wait);
1521         INIT_LIST_HEAD(&r->r_unsafe_item);
1522         INIT_LIST_HEAD(&r->r_unsafe_dir_item);
1523         return r;
1524 }
1525 
1526 /*
1527  * Return the leftmost entry of mdsc->request_tree (the node found by
1528  * rb_first()), or NULL when the tree is empty.
1529  *
1530  * called under mdsc->mutex.
      */
1531 static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1532 {
1533         struct rb_node *n = rb_first(&mdsc->request_tree);
1534 
1535         return n ? rb_entry(n, struct ceph_mds_request, r_node) : NULL;
1536 }
1538 
/*
 * r_tid of the request returned by __get_oldest_req(), or 0 when the
 * request tree is empty.
 */
1539 static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1540 {
1541         struct ceph_mds_request *oldest = __get_oldest_req(mdsc);
1542 
1543         return oldest ? oldest->r_tid : 0;
1544 }
1547 
1548 /*
1549  * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
1550  * on build_path_from_dentry in fs/cifs/dir.c.
1551  *
1552  * If @stop_on_nosnap, generate path relative to the first non-snapped
1553  * inode.
1554  *
1555  * Encode hidden .snap dirs as a double /, i.e.
1556  *   foo/.snap/bar -> foo//bar
      *
      * The path is built in two passes over the d_parent chain: the first
      * adds up the length, the second fills the buffer from its end towards
      * its start.  If the second pass does not finish exactly at offset 0,
      * or rename_lock changed in between, the buffer is freed and both
      * passes are repeated.
      *
      * On success *plen is the path length (without the trailing NUL) and
      * *base is the ino of the dentry at which the walk stopped.  Returns
      * ERR_PTR(-EINVAL) for a NULL @dentry and ERR_PTR(-ENOMEM) when the
      * buffer cannot be allocated.
1557  */
1558 char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1559                            int stop_on_nosnap)
1560 {
1561         struct dentry *temp;
1562         char *path;
1563         int len, pos;
1564         unsigned seq;
1565 
1566         if (dentry == NULL)
1567                 return ERR_PTR(-EINVAL);
1568 
1569 retry:
1570         len = 0;
1571         seq = read_seqbegin(&rename_lock);
1572         rcu_read_lock();
             /* pass 1: add up the space needed for every component */
1573         for (temp = dentry; !IS_ROOT(temp);) {
1574                 struct inode *inode = temp->d_inode;
1575                 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1576                         len++;  /* slash only */
1577                 else if (stop_on_nosnap && inode &&
1578                          ceph_snap(inode) == CEPH_NOSNAP)
1579                         break;
1580                 else
1581                         len += 1 + temp->d_name.len;
1582                 temp = temp->d_parent;
1583         }
1584         rcu_read_unlock();
1585         if (len)
1586                 len--;  /* no leading '/' */
1587 
1588         path = kmalloc(len+1, GFP_NOFS);
1589         if (path == NULL)
1590                 return ERR_PTR(-ENOMEM);
1591         pos = len;
1592         path[pos] = 0;  /* trailing null */
1593         rcu_read_lock();
             /* pass 2: copy the names in, last component first */
1594         for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1595                 struct inode *inode;
1596 
1597                 spin_lock(&temp->d_lock);
1598                 inode = temp->d_inode;
1599                 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
                             /* no name is copied; only the '/' below is written */
1600                         dout("build_path path+%d: %p SNAPDIR\n",
1601                              pos, temp);
1602                 } else if (stop_on_nosnap && inode &&
1603                            ceph_snap(inode) == CEPH_NOSNAP) {
1604                         spin_unlock(&temp->d_lock);
1605                         break;
1606                 } else {
1607                         pos -= temp->d_name.len;
1608                         if (pos < 0) {
                                     /* longer than measured; caught by the pos check below */
1609                                 spin_unlock(&temp->d_lock);
1610                                 break;
1611                         }
1612                         strncpy(path + pos, temp->d_name.name,
1613                                 temp->d_name.len);
1614                 }
1615                 spin_unlock(&temp->d_lock);
1616                 if (pos)
1617                         path[--pos] = '/';
1618                 temp = temp->d_parent;
1619         }
1620         rcu_read_unlock();
1621         if (pos != 0 || read_seqretry(&rename_lock, seq)) {
1622                 pr_err("build_path did not end path lookup where "
1623                        "expected, namelen is %d, pos is %d\n", len, pos);
1624                 /* presumably this is only possible if racing with a
1625                    rename of one of the parent directories (we can not
1626                    lock the dentries above us to prevent this, but
1627                    retrying should be harmless) */
1628                 kfree(path);
1629                 goto retry;
1630         }
1631 
             /* temp is the dentry at which the second pass stopped */
1632         *base = ceph_ino(temp->d_inode);
1633         *plen = len;
1634         dout("build_path on %p %d built %llx '%.*s'\n",
1635              dentry, d_count(dentry), *base, len, path);
1636         return path;
1637 }
1638 
/*
 * Work out the ino + path pair that names @dentry.
 *
 * When the parent inode is not snapped, *pino is the parent's ino and
 * *ppath / *ppathlen point straight at the dentry's own name; nothing
 * is allocated and *pfreepath is left untouched.  Otherwise the path is
 * built by ceph_mdsc_build_path() with stop_on_nosnap set and
 * *pfreepath is set to 1.  Returns 0, or the error from
 * ceph_mdsc_build_path().
 */
1639 static int build_dentry_path(struct dentry *dentry,
1640                              const char **ppath, int *ppathlen, u64 *pino,
1641                              int *pfreepath)
1642 {
1643         char *built;
1644 
1645         if (ceph_snap(dentry->d_parent->d_inode) != CEPH_NOSNAP) {
1646                 built = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1647                 if (IS_ERR(built))
1648                         return PTR_ERR(built);
1649                 *ppath = built;
1650                 *pfreepath = 1;
1651                 return 0;
1652         }
1653         *pino = ceph_ino(dentry->d_parent->d_inode);
1654         *ppath = dentry->d_name.name;
1655         *ppathlen = dentry->d_name.len;
1656         return 0;
1657 }
1658 
1659 static int build_inode_path(struct inode *inode,
1660                             const char **ppath, int *ppathlen, u64 *pino,
1661                             int *pfreepath)
1662 {
1663         struct dentry *dentry;
1664         char *path;
1665 
1666         if (ceph_snap(inode) == CEPH_NOSNAP) {
1667                 *pino = ceph_ino(inode);
1668                 *ppathlen = 0;
1669                 return 0;
1670         }
1671         dentry = d_find_alias(inode);
1672         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1673         dput(dentry);
1674         if (IS_ERR(path))
1675                 return PTR_ERR(path);
1676         *ppath = path;
1677         *pfreepath = 1;
1678         return 0;
1679 }
1680 
1681 /*
1682  * request arguments may be specified via an inode *, a dentry *, or
1683  * an explicit ino+path.
1684  */
1685 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1686                                   const char *rpath, u64 rino,
1687                                   const char **ppath, int *pathlen,
1688                                   u64 *ino, int *freepath)
1689 {
1690         int r = 0;
1691 
1692         if (rinode) {
1693                 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1694                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1695                      ceph_snap(rinode));
1696         } else if (rdentry) {
1697                 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1698                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1699                      *ppath);
1700         } else if (rpath || rino) {
1701                 *ino = rino;
1702                 *ppath = rpath;
1703                 *pathlen = rpath ? strlen(rpath) : 0;
1704                 dout(" path %.*s\n", *pathlen, rpath);
1705         }
1706 
1707         return r;
1708 }
1709 
1710 /*
1711  * called under mdsc->mutex
1712  */
1713 static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1714                                                struct ceph_mds_request *req,
1715                                                int mds)
1716 {
1717         struct ceph_msg *msg;
1718         struct ceph_mds_request_head *head;
1719         const char *path1 = NULL;
1720         const char *path2 = NULL;
1721         u64 ino1 = 0, ino2 = 0;
1722         int pathlen1 = 0, pathlen2 = 0;
1723         int freepath1 = 0, freepath2 = 0;
1724         int len;
1725         u16 releases;
1726         void *p, *end;
1727         int ret;
1728 
1729         ret = set_request_path_attr(req->r_inode, req->r_dentry,
1730                               req->r_path1, req->r_ino1.ino,
1731                               &path1, &pathlen1, &ino1, &freepath1);
1732         if (ret < 0) {
1733                 msg = ERR_PTR(ret);
1734                 goto out;
1735         }
1736 
1737         ret = set_request_path_attr(NULL, req->r_old_dentry,
1738                               req->r_path2, req->r_ino2.ino,
1739                               &path2, &pathlen2, &ino2, &freepath2);
1740         if (ret < 0) {
1741                 msg = ERR_PTR(ret);
1742                 goto out_free1;
1743         }
1744 
1745         len = sizeof(*head) +
1746                 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1747 
1748         /* calculate (max) length for cap releases */
1749         len += sizeof(struct ceph_mds_request_release) *
1750                 (!!req->r_inode_drop + !!req->r_dentry_drop +
1751                  !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1752         if (req->r_dentry_drop)
1753                 len += req->r_dentry->d_name.len;
1754         if (req->r_old_dentry_drop)
1755                 len += req->r_old_dentry->d_name.len;
1756 
1757         msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
1758         if (!msg) {
1759                 msg = ERR_PTR(-ENOMEM);
1760                 goto out_free2;
1761         }
1762 
1763         msg->hdr.tid = cpu_to_le64(req->r_tid);
1764 
1765         head = msg->front.iov_base;
1766         p = msg->front.iov_base + sizeof(*head);
1767         end = msg->front.iov_base + msg->front.iov_len;
1768 
1769         head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1770         head->op = cpu_to_le32(req->r_op);
1771         head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
1772         head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
1773         head->args = req->r_args;
1774 
1775         ceph_encode_filepath(&p, end, ino1, path1);
1776         ceph_encode_filepath(&p, end, ino2, path2);
1777 
1778         /* make note of release offset, in case we need to replay */
1779         req->r_request_release_offset = p - msg->front.iov_base;
1780 
1781         /* cap releases */
1782         releases = 0;
1783         if (req->r_inode_drop)
1784                 releases += ceph_encode_inode_release(&p,
1785                       req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1786                       mds, req->r_inode_drop, req->r_inode_unless, 0);
1787         if (req->r_dentry_drop)
1788                 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1789                        mds, req->r_dentry_drop, req->r_dentry_unless);
1790         if (req->r_old_dentry_drop)
1791                 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1792                        mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1793         if (req->r_old_inode_drop)
1794                 releases += ceph_encode_inode_release(&p,
1795                       req->r_old_dentry->d_inode,
1796                       mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1797         head->num_releases = cpu_to_le16(releases);
1798 
1799         BUG_ON(p > end);
1800         msg->front.iov_len = p - msg->front.iov_base;
1801         msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1802 
1803         if (req->r_data_len) {
1804                 /* outbound data set only by ceph_sync_setxattr() */
1805                 BUG_ON(!req->r_pages);
1806                 ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
1807         }
1808 
1809         msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1810         msg->hdr.data_off = cpu_to_le16(0);
1811 
1812 out_free2:
1813         if (freepath2)
1814                 kfree((char *)path2);
1815 out_free1:
1816         if (freepath1)
1817                 kfree((char *)path1);
1818 out:
1819         return msg;
1820 }
1821 
1822 /*
1823  * called under mdsc->mutex if error, under no mutex if
1824  * success.
1825  */
1826 static void complete_request(struct ceph_mds_client *mdsc,
1827                              struct ceph_mds_request *req)
1828 {
1829         if (req->r_callback)
1830                 req->r_callback(mdsc, req);
1831         else
1832                 complete_all(&req->r_completion);
1833 }
1834 
1835 /*
1836  * called under mdsc->mutex
1837  */
1838 static int __prepare_send_request(struct ceph_mds_client *mdsc,
1839                                   struct ceph_mds_request *req,
1840                                   int mds)
1841 {
1842         struct ceph_mds_request_head *rhead;
1843         struct ceph_msg *msg;
1844         int flags = 0;
1845 
1846         req->r_attempts++;
1847         if (req->r_inode) {
1848                 struct ceph_cap *cap =
1849                         ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1850 
1851                 if (cap)
1852                         req->r_sent_on_mseq = cap->mseq;
1853                 else
1854                         req->r_sent_on_mseq = -1;
1855         }
1856         dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1857              req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1858 
1859         if (req->r_got_unsafe) {
1860                 /*
1861                  * Replay.  Do not regenerate message (and rebuild
1862                  * paths, etc.); just use the original message.
1863                  * Rebuilding paths will break for renames because
1864                  * d_move mangles the src name.
1865                  */
1866                 msg = req->r_request;
1867                 rhead = msg->front.iov_base;
1868 
1869                 flags = le32_to_cpu(rhead->flags);
1870                 flags |= CEPH_MDS_FLAG_REPLAY;
1871                 rhead->flags = cpu_to_le32(flags);
1872 
1873                 if (req->r_target_inode)
1874                         rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1875 
1876                 rhead->num_retry = req->r_attempts - 1;
1877 
1878                 /* remove cap/dentry releases from message */
1879                 rhead->num_releases = 0;
1880                 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
1881                 msg->front.iov_len = req->r_request_release_offset;
1882                 return 0;
1883         }
1884 
1885         if (req->r_request) {
1886                 ceph_msg_put(req->r_request);
1887                 req->r_request = NULL;
1888         }
1889         msg = create_request_message(mdsc, req, mds);
1890         if (IS_ERR(msg)) {
1891                 req->r_err = PTR_ERR(msg);
1892                 complete_request(mdsc, req);
1893                 return PTR_ERR(msg);
1894         }
1895         req->r_request = msg;
1896 
1897         rhead = msg->front.iov_base;
1898         rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1899         if (req->r_got_unsafe)
1900                 flags |= CEPH_MDS_FLAG_REPLAY;
1901         if (req->r_locked_dir)
1902                 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1903         rhead->flags = cpu_to_le32(flags);
1904         rhead->num_fwd = req->r_num_fwd;
1905         rhead->num_retry = req->r_attempts - 1;
1906         rhead->ino = 0;
1907 
1908         dout(" r_locked_dir = %p\n", req->r_locked_dir);
1909         return 0;
1910 }
1911 
1912 /*
1913  * send request, or put it on the appropriate wait list.
1914  */
1915 static int __do_request(struct ceph_mds_client *mdsc,
1916                         struct ceph_mds_request *req)
1917 {
1918         struct ceph_mds_session *session = NULL;
1919         int mds = -1;
1920         int err = -EAGAIN;
1921 
1922         if (req->r_err || req->r_got_result) {
1923                 if (req->r_aborted)
1924                         __unregister_request(mdsc, req);
1925                 goto out;
1926         }
1927 
1928         if (req->r_timeout &&
1929             time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1930                 dout("do_request timed out\n");
1931                 err = -EIO;
1932                 goto finish;
1933         }
1934 
1935         put_request_session(req);
1936 
1937         mds = __choose_mds(mdsc, req);
1938         if (mds < 0 ||
1939             ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1940                 dout("do_request no mds or not active, waiting for map\n");
1941                 list_add(&req->r_wait, &mdsc->waiting_for_map);
1942                 goto out;
1943         }
1944 
1945         /* get, open session */
1946         session = __ceph_lookup_mds_session(mdsc, mds);
1947         if (!session) {
1948                 session = register_session(mdsc, mds);
1949                 if (IS_ERR(session)) {
1950                         err = PTR_ERR(session);
1951                         goto finish;
1952                 }
1953         }
1954         req->r_session = get_session(session);
1955 
1956         dout("do_request mds%d session %p state %s\n", mds, session,
1957              session_state_name(session->s_state));
1958         if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1959             session->s_state != CEPH_MDS_SESSION_HUNG) {
1960                 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1961                     session->s_state == CEPH_MDS_SESSION_CLOSING)
1962                         __open_session(mdsc, session);
1963                 list_add(&req->r_wait, &session->s_waiting);
1964                 goto out_session;
1965         }
1966 
1967         /* send request */
1968         req->r_resend_mds = -1;   /* forget any previous mds hint */
1969 
1970         if (req->r_request_started == 0)   /* note request start time */
1971                 req->r_request_started = jiffies;
1972 
1973         err = __prepare_send_request(mdsc, req, mds);
1974         if (!err) {
1975                 ceph_msg_get(req->r_request);
1976                 ceph_con_send(&session->s_con, req->r_request);
1977         }
1978 
1979 out_session:
1980         ceph_put_mds_session(session);
1981 out:
1982         return err;
1983 
1984 finish:
1985         req->r_err = err;
1986         complete_request(mdsc, req);
1987         goto out;
1988 }
1989 
1990 /*
1991  * called under mdsc->mutex
1992  */
1993 static void __wake_requests(struct ceph_mds_client *mdsc,
1994                             struct list_head *head)
1995 {
1996         struct ceph_mds_request *req;
1997         LIST_HEAD(tmp_list);
1998 
1999         list_splice_init(head, &tmp_list);
2000 
2001         while (!list_empty(&tmp_list)) {
2002                 req = list_entry(tmp_list.next,
2003                                  struct ceph_mds_request, r_wait);
2004                 list_del_init(&req->r_wait);
2005                 dout(" wake request %p tid %llu\n", req, req->r_tid);
2006                 __do_request(mdsc, req);
2007         }
2008 }
2009 
2010 /*
2011  * Wake up threads with requests pending for @mds, so that they can
2012  * resubmit their requests to a possibly different mds.
2013  */
2014 static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2015 {
2016         struct ceph_mds_request *req;
2017         struct rb_node *p;
2018 
2019         dout("kick_requests mds%d\n", mds);
2020         for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
2021                 req = rb_entry(p, struct ceph_mds_request, r_node);
2022                 if (req->r_got_unsafe)
2023                         continue;
2024                 if (req->r_session &&
2025                     req->r_session->s_mds == mds) {
2026                         dout(" kicking tid %llu\n", req->r_tid);
2027                         __do_request(mdsc, req);
2028                 }
2029         }
2030 }
2031 
2032 void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
2033                               struct ceph_mds_request *req)
2034 {
2035         dout("submit_request on %p\n", req);
2036         mutex_lock(&mdsc->mutex);
2037         __register_request(mdsc, req, NULL);
2038         __do_request(mdsc, req);
2039         mutex_unlock(&mdsc->mutex);
2040 }
2041 
2042 /*
2043  * Synchrously perform an mds request.  Take care of all of the
2044  * session setup, forwarding, retry details.
2045  */
2046 int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2047                          struct inode *dir,
2048                          struct ceph_mds_request *req)
2049 {
2050         int err;
2051 
2052         dout("do_request on %p\n", req);
2053 
2054         /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
2055         if (req->r_inode)
2056                 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
2057         if (req->r_locked_dir)
2058                 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
2059         if (req->r_old_dentry)
2060                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
2061                                   CEPH_CAP_PIN);
2062 
2063         /* issue */
2064         mutex_lock(&mdsc->mutex);
2065         __register_request(mdsc, req, dir);
2066         __do_request(mdsc, req);
2067 
2068         if (req->r_err) {
2069                 err = req->r_err;
2070                 __unregister_request(mdsc, req);
2071                 dout("do_request early error %d\n", err);
2072                 goto out;
2073         }
2074 
2075         /* wait */
2076         mutex_unlock(&mdsc->mutex);
2077         dout("do_request waiting\n");
2078         if (req->r_timeout) {
2079                 err = (long)wait_for_completion_killable_timeout(
2080                         &req->r_completion, req->r_timeout);
2081                 if (err == 0)
2082                         err = -EIO;
2083         } else {
2084                 err = wait_for_completion_killable(&req->r_completion);
2085         }
2086         dout("do_request waited, got %d\n", err);
2087         mutex_lock(&mdsc->mutex);
2088 
2089         /* only abort if we didn't race with a real reply */
2090         if (req->r_got_result) {
2091                 err = le32_to_cpu(req->r_reply_info.head->result);
2092         } else if (err < 0) {
2093                 dout("aborted request %lld with %d\n", req->r_tid, err);
2094 
2095                 /*
2096                  * ensure we aren't running concurrently with
2097                  * ceph_fill_trace or ceph_readdir_prepopulate, which
2098                  * rely on locks (dir mutex) held by our caller.
2099                  */
2100                 mutex_lock(&req->r_fill_mutex);
2101                 req->r_err = err;
2102                 req->r_aborted = true;
2103                 mutex_unlock(&req->r_fill_mutex);
2104 
2105                 if (req->r_locked_dir &&
2106                     (req->r_op & CEPH_MDS_OP_WRITE))
2107                         ceph_invalidate_dir_request(req);
2108         } else {
2109                 err = req->r_err;
2110         }
2111 
2112 out:
2113         mutex_unlock(&mdsc->mutex);
2114         dout("do_request %p done, result %d\n", req, err);
2115         return err;
2116 }
2117 
2118 /*
2119  * Invalidate dir's completeness, dentry lease state on an aborted MDS
2120  * namespace request.
2121  */
2122 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2123 {
2124         struct inode *inode = req->r_locked_dir;
2125 
2126         dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
2127 
2128         ceph_dir_clear_complete(inode);
2129         if (req->r_dentry)
2130                 ceph_invalidate_dentry_lease(req->r_dentry);
2131         if (req->r_old_dentry)
2132                 ceph_invalidate_dentry_lease(req->r_old_dentry);
2133 }
2134 
2135 /*
2136  * Handle mds reply.
2137  *
2138  * We take the session mutex and parse and process the reply immediately.
2139  * This preserves the logical ordering of replies, capabilities, etc., sent
2140  * by the MDS as they are applied to our local cache.
2141  */
2142 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2143 {
2144         struct ceph_mds_client *mdsc = session->s_mdsc;
2145         struct ceph_mds_request *req;
2146         struct ceph_mds_reply_head *head = msg->front.iov_base;
2147         struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
2148         u64 tid;
2149         int err, result;
2150         int mds = session->s_mds;
2151 
2152         if (msg->front.iov_len < sizeof(*head)) {
2153                 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
2154                 ceph_msg_dump(msg);
2155                 return;
2156         }
2157 
2158         /* get request, session */
2159         tid = le64_to_cpu(msg->hdr.tid);
2160         mutex_lock(&mdsc->mutex);
2161         req = __lookup_request(mdsc, tid);
2162         if (!req) {
2163                 dout("handle_reply on unknown tid %llu\n", tid);
2164                 mutex_unlock(&mdsc->mutex);
2165                 return;
2166         }
2167         dout("handle_reply %p\n", req);
2168 
2169         /* correct session? */
2170         if (req->r_session != session) {
2171                 pr_err("mdsc_handle_reply got %llu on session mds%d"
2172                        " not mds%d\n", tid, session->s_mds,
2173                        req->r_session ? req->r_session->s_mds : -1);
2174                 mutex_unlock(&mdsc->mutex);
2175                 goto out;
2176         }
2177 
2178         /* dup? */
2179         if ((req->r_got_unsafe && !head->safe) ||
2180             (req->r_got_safe && head->safe)) {
2181                 pr_warning("got a dup %s reply on %llu from mds%d\n",
2182                            head->safe ? "safe" : "unsafe", tid, mds);
2183                 mutex_unlock(&mdsc->mutex);
2184                 goto out;
2185         }
2186         if (req->r_got_safe && !head->safe) {
2187                 pr_warning("got unsafe after safe on %llu from mds%d\n",
2188                            tid, mds);
2189                 mutex_unlock(&mdsc->mutex);
2190                 goto out;
2191         }
2192 
2193         result = le32_to_cpu(head->result);
2194 
2195         /*
2196          * Handle an ESTALE
2197          * if we're not talking to the authority, send to them
2198          * if the authority has changed while we weren't looking,
2199          * send to new authority
2200          * Otherwise we just have to return an ESTALE
2201          */
2202         if (result == -ESTALE) {
2203                 dout("got ESTALE on request %llu", req->r_tid);
2204                 if (req->r_direct_mode != USE_AUTH_MDS) {
2205                         dout("not using auth, setting for that now");
2206                         req->r_direct_mode = USE_AUTH_MDS;
2207                         __do_request(mdsc, req);
2208                         mutex_unlock(&mdsc->mutex);
2209                         goto out;
2210                 } else  {
2211                         int mds = __choose_mds(mdsc, req);
2212                         if (mds >= 0 && mds != req->r_session->s_mds) {
2213                                 dout("but auth changed, so resending");
2214                                 __do_request(mdsc, req);
2215                                 mutex_unlock(&mdsc->mutex);
2216                                 goto out;
2217                         }
2218                 }
2219                 dout("have to return ESTALE on request %llu", req->r_tid);
2220         }
2221 
2222 
2223         if (head->safe) {
2224                 req->r_got_safe = true;
2225                 __unregister_request(mdsc, req);
2226 
2227                 if (req->r_got_unsafe) {
2228                         /*
2229                          * We already handled the unsafe response, now do the
2230                          * cleanup.  No need to examine the response; the MDS
2231                          * doesn't include any result info in the safe
2232                          * response.  And even if it did, there is nothing
2233                          * useful we could do with a revised return value.
2234                          */
2235                         dout("got safe reply %llu, mds%d\n", tid, mds);
2236                         list_del_init(&req->r_unsafe_item);
2237 
2238                         /* last unsafe request during umount? */
2239                         if (mdsc->stopping && !__get_oldest_req(mdsc))
2240                                 complete_all(&mdsc->safe_umount_waiters);
2241                         mutex_unlock(&mdsc->mutex);
2242                         goto out;
2243                 }
2244         } else {
2245                 req->r_got_unsafe = true;
2246                 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
2247         }
2248 
2249         dout("handle_reply tid %lld result %d\n", tid, result);
2250         rinfo = &req->r_reply_info;
2251         err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2252         mutex_unlock(&mdsc->mutex);
2253 
2254         mutex_lock(&session->s_mutex);
2255         if (err < 0) {
2256                 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2257                 ceph_msg_dump(msg);
2258                 goto out_err;
2259         }
2260 
2261         /* snap trace */
2262         if (rinfo->snapblob_len) {
2263                 down_write(&mdsc->snap_rwsem);
2264                 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2265                                rinfo->snapblob + rinfo->snapblob_len,
2266                                le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
2267                 downgrade_write(&mdsc->snap_rwsem);
2268         } else {
2269                 down_read(&mdsc->snap_rwsem);
2270         }
2271 
2272         /* insert trace into our cache */
2273         mutex_lock(&req->r_fill_mutex);
2274         err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2275         if (err == 0) {
2276                 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
2277                                     req->r_op == CEPH_MDS_OP_LSSNAP))
2278                         ceph_readdir_prepopulate(req, req->r_session);
2279                 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2280         }
2281         mutex_unlock(&req->r_fill_mutex);
2282 
2283         up_read(&mdsc->snap_rwsem);
2284 out_err:
2285         mutex_lock(&mdsc->mutex);
2286         if (!req->r_aborted) {
2287                 if (err) {
2288                         req->r_err = err;
2289                 } else {
2290                         req->r_reply = msg;
2291                         ceph_msg_get(msg);
2292                         req->r_got_result = true;
2293                 }
2294         } else {
2295                 dout("reply arrived after request %lld was aborted\n", tid);
2296         }
2297         mutex_unlock(&mdsc->mutex);
2298 
2299         ceph_add_cap_releases(mdsc, req->r_session);
2300         mutex_unlock(&session->s_mutex);
2301 
2302         /* kick calling process */
2303         complete_request(mdsc, req);
2304 out:
2305         ceph_mdsc_put_request(req);
2306         return;
2307 }
2308 
2309 
2310 
2311 /*
2312  * handle mds notification that our request has been forwarded.
2313  */
2314 static void handle_forward(struct ceph_mds_client *mdsc,
2315                            struct ceph_mds_session *session,
2316                            struct ceph_msg *msg)
2317 {
2318         struct ceph_mds_request *req;
2319         u64 tid = le64_to_cpu(msg->hdr.tid);
2320         u32 next_mds;
2321         u32 fwd_seq;
2322         int err = -EINVAL;
2323         void *p = msg->front.iov_base;
2324         void *end = p + msg->front.iov_len;
2325 
2326         ceph_decode_need(&p, end, 2*sizeof(u32), bad);
2327         next_mds = ceph_decode_32(&p);
2328         fwd_seq = ceph_decode_32(&p);
2329 
2330         mutex_lock(&mdsc->mutex);
2331         req = __lookup_request(mdsc, tid);
2332         if (!req) {
2333                 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2334                 goto out;  /* dup reply? */
2335         }
2336 
2337         if (req->r_aborted) {
2338                 dout("forward tid %llu aborted, unregistering\n", tid);
2339                 __unregister_request(mdsc, req);
2340         } else if (fwd_seq <= req->r_num_fwd) {
2341                 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2342                      tid, next_mds, req->r_num_fwd, fwd_seq);
2343         } else {
2344                 /* resend. forward race not possible; mds would drop */
2345                 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2346                 BUG_ON(req->r_err);
2347                 BUG_ON(req->r_got_result);
2348                 req->r_num_fwd = fwd_seq;
2349                 req->r_resend_mds = next_mds;
2350                 put_request_session(req);
2351                 __do_request(mdsc, req);
2352         }
2353         ceph_mdsc_put_request(req);
2354 out:
2355         mutex_unlock(&mdsc->mutex);
2356         return;
2357 
2358 bad:
2359         pr_err("mdsc_handle_forward decode error err=%d\n", err);
2360 }
2361 
2362 /*
2363  * handle a mds session control message
2364  */
2365 static void handle_session(struct ceph_mds_session *session,
2366                            struct ceph_msg *msg)
2367 {
2368         struct ceph_mds_client *mdsc = session->s_mdsc;
2369         u32 op;
2370         u64 seq;
2371         int mds = session->s_mds;
2372         struct ceph_mds_session_head *h = msg->front.iov_base;
2373         int wake = 0;
2374 
2375         /* decode */
2376         if (msg->front.iov_len != sizeof(*h))
2377                 goto bad;
2378         op = le32_to_cpu(h->op);
2379         seq = le64_to_cpu(h->seq);
2380 
2381         mutex_lock(&mdsc->mutex);
2382         if (op == CEPH_SESSION_CLOSE)
2383                 __unregister_session(mdsc, session);
2384         /* FIXME: this ttl calculation is generous */
2385         session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
2386         mutex_unlock(&mdsc->mutex);
2387 
2388         mutex_lock(&session->s_mutex);
2389 
2390         dout("handle_session mds%d %s %p state %s seq %llu\n",
2391              mds, ceph_session_op_name(op), session,
2392              session_state_name(session->s_state), seq);
2393 
2394         if (session->s_state == CEPH_MDS_SESSION_HUNG) {
2395                 session->s_state = CEPH_MDS_SESSION_OPEN;
2396                 pr_info("mds%d came back\n", session->s_mds);
2397         }
2398 
2399         switch (op) {
2400         case CEPH_SESSION_OPEN:
2401                 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2402                         pr_info("mds%d reconnect success\n", session->s_mds);
2403                 session->s_state = CEPH_MDS_SESSION_OPEN;
2404                 renewed_caps(mdsc, session, 0);
2405                 wake = 1;
2406                 if (mdsc->stopping)
2407                         __close_session(mdsc, session);
2408                 break;
2409 
2410         case CEPH_SESSION_RENEWCAPS:
2411                 if (session->s_renew_seq == seq)
2412                         renewed_caps(mdsc, session, 1);
2413                 break;
2414 
2415         case CEPH_SESSION_CLOSE:
2416                 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2417                         pr_info("mds%d reconnect denied\n", session->s_mds);
2418                 remove_session_caps(session);
2419                 wake = 1; /* for good measure */
2420                 wake_up_all(&mdsc->session_close_wq);
2421                 kick_requests(mdsc, mds);
2422                 break;
2423 
2424         case CEPH_SESSION_STALE:
2425                 pr_info("mds%d caps went stale, renewing\n",
2426                         session->s_mds);
2427                 spin_lock(&session->s_gen_ttl_lock);
2428                 session->s_cap_gen++;
2429                 session->s_cap_ttl = jiffies - 1;
2430                 spin_unlock(&session->s_gen_ttl_lock);
2431                 send_renew_caps(mdsc, session);
2432                 break;
2433 
2434         case CEPH_SESSION_RECALL_STATE:
2435                 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2436                 break;
2437 
2438         case CEPH_SESSION_FLUSHMSG:
2439                 send_flushmsg_ack(mdsc, session, seq);
2440                 break;
2441 
2442         default:
2443                 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2444                 WARN_ON(1);
2445         }
2446 
2447         mutex_unlock(&session->s_mutex);
2448         if (wake) {
2449                 mutex_lock(&mdsc->mutex);
2450                 __wake_requests(mdsc, &session->s_waiting);
2451                 mutex_unlock(&mdsc->mutex);
2452         }
2453         return;
2454 
2455 bad:
2456         pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2457                (int)msg->front.iov_len);
2458         ceph_msg_dump(msg);
2459         return;
2460 }
2461 
2462 
2463 /*
2464  * called under session->mutex.
2465  */
2466 static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2467                                    struct ceph_mds_session *session)
2468 {
2469         struct ceph_mds_request *req, *nreq;
2470         int err;
2471 
2472         dout("replay_unsafe_requests mds%d\n", session->s_mds);
2473 
2474         mutex_lock(&mdsc->mutex);
2475         list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2476                 err = __prepare_send_request(mdsc, req, session->s_mds);
2477                 if (!err) {
2478                         ceph_msg_get(req->r_request);
2479                         ceph_con_send(&session->s_con, req->r_request);
2480                 }
2481         }
2482         mutex_unlock(&mdsc->mutex);
2483 }
2484 
2485 /*
2486  * Encode information about a cap for a reconnect with the MDS.
2487  */
2488 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2489                           void *arg)
2490 {
2491         union {
2492                 struct ceph_mds_cap_reconnect v2;
2493                 struct ceph_mds_cap_reconnect_v1 v1;
2494         } rec;
2495         size_t reclen;
2496         struct ceph_inode_info *ci;
2497         struct ceph_reconnect_state *recon_state = arg;
2498         struct ceph_pagelist *pagelist = recon_state->pagelist;
2499         char *path;
2500         int pathlen, err;
2501         u64 pathbase;
2502         struct dentry *dentry;
2503 
2504         ci = cap->ci;
2505 
2506         dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2507              inode, ceph_vinop(inode), cap, cap->cap_id,
2508              ceph_cap_string(cap->issued));
2509         err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2510         if (err)
2511                 return err;
2512 
2513         dentry = d_find_alias(inode);
2514         if (dentry) {
2515                 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2516                 if (IS_ERR(path)) {
2517                         err = PTR_ERR(path);
2518                         goto out_dput;
2519                 }
2520         } else {
2521                 path = NULL;
2522                 pathlen = 0;
2523         }
2524         err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2525         if (err)
2526                 goto out_free;
2527 
2528         spin_lock(&ci->i_ceph_lock);
2529         cap->seq = 0;        /* reset cap seq */
2530         cap->issue_seq = 0;  /* and issue_seq */
2531         cap->mseq = 0;       /* and migrate_seq */
2532         cap->cap_gen = cap->session->s_cap_gen;
2533 
2534         if (recon_state->flock) {
2535                 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2536                 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2537                 rec.v2.issued = cpu_to_le32(cap->issued);
2538                 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2539                 rec.v2.pathbase = cpu_to_le64(pathbase);
2540                 rec.v2.flock_len = 0;
2541                 reclen = sizeof(rec.v2);
2542         } else {
2543                 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2544                 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2545                 rec.v1.issued = cpu_to_le32(cap->issued);
2546                 rec.v1.size = cpu_to_le64(inode->i_size);
2547                 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2548                 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2549                 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2550                 rec.v1.pathbase = cpu_to_le64(pathbase);
2551                 reclen = sizeof(rec.v1);
2552         }
2553         spin_unlock(&ci->i_ceph_lock);
2554 
2555         if (recon_state->flock) {
2556                 int num_fcntl_locks, num_flock_locks;
2557                 struct ceph_filelock *flocks;
2558 
2559 encode_again:
2560                 spin_lock(&inode->i_lock);
2561                 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2562                 spin_unlock(&inode->i_lock);
2563                 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2564                                  sizeof(struct ceph_filelock), GFP_NOFS);
2565                 if (!flocks) {
2566                         err = -ENOMEM;
2567                         goto out_free;
2568                 }
2569                 spin_lock(&inode->i_lock);
2570                 err = ceph_encode_locks_to_buffer(inode, flocks,
2571                                                   num_fcntl_locks,
2572                                                   num_flock_locks);
2573                 spin_unlock(&inode->i_lock);
2574                 if (err) {
2575                         kfree(flocks);
2576                         if (err == -ENOSPC)
2577                                 goto encode_again;
2578                         goto out_free;
2579                 }
2580                 /*
2581                  * number of encoded locks is stable, so copy to pagelist
2582                  */
2583                 rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
2584                                     (num_fcntl_locks+num_flock_locks) *
2585                                     sizeof(struct ceph_filelock));
2586                 err = ceph_pagelist_append(pagelist, &rec, reclen);
2587                 if (!err)
2588                         err = ceph_locks_to_pagelist(flocks, pagelist,
2589                                                      num_fcntl_locks,
2590                                                      num_flock_locks);
2591                 kfree(flocks);
2592         } else {
2593                 err = ceph_pagelist_append(pagelist, &rec, reclen);
2594         }
2595 
2596         recon_state->nr_caps++;
2597 out_free:
2598         kfree(path);
2599 out_dput:
2600         dput(dentry);
2601         return err;
2602 }
2603 
2604 
2605 /*
2606  * If an MDS fails and recovers, clients need to reconnect in order to
2607  * reestablish shared state.  This includes all caps issued through
2608  * this session _and_ the snap_realm hierarchy.  Because it's not
2609  * clear which snap realms the mds cares about, we send everything we
2610  * know about.. that ensures we'll then get any new info the
2611  * recovering MDS might have.
2612  *
2613  * This is a relatively heavyweight operation, but it's rare.
2614  *
2615  * called with mdsc->mutex held.
2616  */
2617 static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2618                                struct ceph_mds_session *session)
2619 {
2620         struct ceph_msg *reply;
2621         struct rb_node *p;
2622         int mds = session->s_mds;
2623         int err = -ENOMEM;
2624         int s_nr_caps;
2625         struct ceph_pagelist *pagelist;
2626         struct ceph_reconnect_state recon_state;
2627 
2628         pr_info("mds%d reconnect start\n", mds);
2629 
2630         pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2631         if (!pagelist)
2632                 goto fail_nopagelist;
2633         ceph_pagelist_init(pagelist);
2634 
2635         reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
2636         if (!reply)
2637                 goto fail_nomsg;
2638 
2639         mutex_lock(&session->s_mutex);
2640         session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2641         session->s_seq = 0;
2642 
2643         ceph_con_close(&session->s_con);
2644         ceph_con_open(&session->s_con,
2645                       CEPH_ENTITY_TYPE_MDS, mds,
2646                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2647 
2648         /* replay unsafe requests */
2649         replay_unsafe_requests(mdsc, session);
2650 
2651         down_read(&mdsc->snap_rwsem);
2652 
2653         dout("session %p state %s\n", session,
2654              session_state_name(session->s_state));
2655 
2656         spin_lock(&session->s_gen_ttl_lock);
2657         session->s_cap_gen++;
2658         spin_unlock(&session->s_gen_ttl_lock);
2659 
2660         spin_lock(&session->s_cap_lock);
2661         /*
2662          * notify __ceph_remove_cap() that we are composing cap reconnect.
2663          * If a cap get released before being added to the cap reconnect,
2664          * __ceph_remove_cap() should skip queuing cap release.
2665          */
2666         session->s_cap_reconnect = 1;
2667         /* drop old cap expires; we're about to reestablish that state */
2668         discard_cap_releases(mdsc, session);
2669         spin_unlock(&session->s_cap_lock);
2670 
2671         /* traverse this session's caps */
2672         s_nr_caps = session->s_nr_caps;
2673         err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
2674         if (err)
2675                 goto fail;
2676 
2677         recon_state.nr_caps = 0;
2678         recon_state.pagelist = pagelist;
2679         recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2680         err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2681         if (err < 0)
2682                 goto fail;
2683 
2684         spin_lock(&session->s_cap_lock);
2685         session->s_cap_reconnect = 0;
2686         spin_unlock(&session->s_cap_lock);
2687 
2688         /*
2689          * snaprealms.  we provide mds with the ino, seq (version), and
2690          * parent for all of our realms.  If the mds has any newer info,
2691          * it will tell us.
2692          */
2693         for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2694                 struct ceph_snap_realm *realm =
2695                         rb_entry(p, struct ceph_snap_realm, node);
2696                 struct ceph_mds_snaprealm_reconnect sr_rec;
2697 
2698                 dout(" adding snap realm %llx seq %lld parent %llx\n",
2699                      realm->ino, realm->seq, realm->parent_ino);
2700                 sr_rec.ino = cpu_to_le64(realm->ino);
2701                 sr_rec.seq = cpu_to_le64(realm->seq);
2702                 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2703                 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2704                 if (err)
2705                         goto fail;
2706         }
2707 
2708         if (recon_state.flock)
2709                 reply->hdr.version = cpu_to_le16(2);
2710 
2711         /* raced with cap release? */
2712         if (s_nr_caps != recon_state.nr_caps) {
2713                 struct page *page = list_first_entry(&pagelist->head,
2714                                                      struct page, lru);
2715                 __le32 *addr = kmap_atomic(page);
2716                 *addr = cpu_to_le32(recon_state.nr_caps);
2717                 kunmap_atomic(addr);
2718         }
2719 
2720         reply->hdr.data_len = cpu_to_le32(pagelist->length);
2721         ceph_msg_data_add_pagelist(reply, pagelist);
2722         ceph_con_send(&session->s_con, reply);
2723 
2724         mutex_unlock(&session->s_mutex);
2725 
2726         mutex_lock(&mdsc->mutex);
2727         __wake_requests(mdsc, &session->s_waiting);
2728         mutex_unlock(&mdsc->mutex);
2729 
2730         up_read(&mdsc->snap_rwsem);
2731         return;
2732 
2733 fail:
2734         ceph_msg_put(reply);
2735         up_read(&mdsc->snap_rwsem);
2736         mutex_unlock(&session->s_mutex);
2737 fail_nomsg:
2738         ceph_pagelist_release(pagelist);
2739         kfree(pagelist);
2740 fail_nopagelist:
2741         pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2742         return;
2743 }
2744 
2745 
2746 /*
2747  * compare old and new mdsmaps, kicking requests
2748  * and closing out old connections as necessary
2749  *
2750  * called under mdsc->mutex.
2751  */
2752 static void check_new_map(struct ceph_mds_client *mdsc,
2753                           struct ceph_mdsmap *newmap,
2754                           struct ceph_mdsmap *oldmap)
2755 {
2756         int i;
2757         int oldstate, newstate;
2758         struct ceph_mds_session *s;
2759 
2760         dout("check_new_map new %u old %u\n",
2761              newmap->m_epoch, oldmap->m_epoch);
2762 
2763         for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2764                 if (mdsc->sessions[i] == NULL)
2765                         continue;
2766                 s = mdsc->sessions[i];
2767                 oldstate = ceph_mdsmap_get_state(oldmap, i);
2768                 newstate = ceph_mdsmap_get_state(newmap, i);
2769 
2770                 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2771                      i, ceph_mds_state_name(oldstate),
2772                      ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2773                      ceph_mds_state_name(newstate),
2774                      ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2775                      session_state_name(s->s_state));
2776 
2777                 if (i >= newmap->m_max_mds ||
2778                     memcmp(ceph_mdsmap_get_addr(oldmap, i),
2779                            ceph_mdsmap_get_addr(newmap, i),
2780                            sizeof(struct ceph_entity_addr))) {
2781                         if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2782                                 /* the session never opened, just close it
2783                                  * out now */
2784                                 __wake_requests(mdsc, &s->s_waiting);
2785                                 __unregister_session(mdsc, s);
2786                         } else {
2787                                 /* just close it */
2788                                 mutex_unlock(&mdsc->mutex);
2789                                 mutex_lock(&s->s_mutex);
2790                                 mutex_lock(&mdsc->mutex);
2791                                 ceph_con_close(&s->s_con);
2792                                 mutex_unlock(&s->s_mutex);
2793                                 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2794                         }
2795 
2796                         /* kick any requests waiting on the recovering mds */
2797                         kick_requests(mdsc, i);
2798                 } else if (oldstate == newstate) {
2799                         continue;  /* nothing new with this mds */
2800                 }
2801 
2802                 /*
2803                  * send reconnect?
2804                  */
2805                 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2806                     newstate >= CEPH_MDS_STATE_RECONNECT) {
2807                         mutex_unlock(&mdsc->mutex);
2808                         send_mds_reconnect(mdsc, s);
2809                         mutex_lock(&mdsc->mutex);
2810                 }
2811 
2812                 /*
2813                  * kick request on any mds that has gone active.
2814                  */
2815                 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2816                     newstate >= CEPH_MDS_STATE_ACTIVE) {
2817                         if (oldstate != CEPH_MDS_STATE_CREATING &&
2818                             oldstate != CEPH_MDS_STATE_STARTING)
2819                                 pr_info("mds%d recovery completed\n", s->s_mds);
2820                         kick_requests(mdsc, i);
2821                         ceph_kick_flushing_caps(mdsc, s);
2822                         wake_up_session_caps(s, 1);
2823                 }
2824         }
2825 
2826         for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2827                 s = mdsc->sessions[i];
2828                 if (!s)
2829                         continue;
2830                 if (!ceph_mdsmap_is_laggy(newmap, i))
2831                         continue;
2832                 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2833                     s->s_state == CEPH_MDS_SESSION_HUNG ||
2834                     s->s_state == CEPH_MDS_SESSION_CLOSING) {
2835                         dout(" connecting to export targets of laggy mds%d\n",
2836                              i);
2837                         __open_export_target_sessions(mdsc, s);
2838                 }
2839         }
2840 }
2841 
2842 
2843 
2844 /*
2845  * leases
2846  */
2847 
2848 /*
2849  * caller must hold session s_mutex, dentry->d_lock
2850  */
2851 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2852 {
2853         struct ceph_dentry_info *di = ceph_dentry(dentry);
2854 
2855         ceph_put_mds_session(di->lease_session);
2856         di->lease_session = NULL;
2857 }
2858 
2859 static void handle_lease(struct ceph_mds_client *mdsc,
2860                          struct ceph_mds_session *session,
2861                          struct ceph_msg *msg)
2862 {
2863         struct super_block *sb = mdsc->fsc->sb;
2864         struct inode *inode;
2865         struct dentry *parent, *dentry;
2866         struct ceph_dentry_info *di;
2867         int mds = session->s_mds;
2868         struct ceph_mds_lease *h = msg->front.iov_base;
2869         u32 seq;
2870         struct ceph_vino vino;
2871         struct qstr dname;
2872         int release = 0;
2873 
2874         dout("handle_lease from mds%d\n", mds);
2875 
2876         /* decode */
2877         if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2878                 goto bad;
2879         vino.ino = le64_to_cpu(h->ino);
2880         vino.snap = CEPH_NOSNAP;
2881         seq = le32_to_cpu(h->seq);
2882         dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2883         dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2884         if (dname.len != get_unaligned_le32(h+1))
2885                 goto bad;
2886 
2887         mutex_lock(&session->s_mutex);
2888         session->s_seq++;
2889 
2890         /* lookup inode */
2891         inode = ceph_find_inode(sb, vino);
2892         dout("handle_lease %s, ino %llx %p %.*s\n",
2893              ceph_lease_op_name(h->action), vino.ino, inode,
2894              dname.len, dname.name);
2895         if (inode == NULL) {
2896                 dout("handle_lease no inode %llx\n", vino.ino);
2897                 goto release;
2898         }
2899 
2900         /* dentry */
2901         parent = d_find_alias(inode);
2902         if (!parent) {
2903                 dout("no parent dentry on inode %p\n", inode);
2904                 WARN_ON(1);
2905                 goto release;  /* hrm... */
2906         }
2907         dname.hash = full_name_hash(dname.name, dname.len);
2908         dentry = d_lookup(parent, &dname);
2909         dput(parent);
2910         if (!dentry)
2911                 goto release;
2912 
2913         spin_lock(&dentry->d_lock);
2914         di = ceph_dentry(dentry);
2915         switch (h->action) {
2916         case CEPH_MDS_LEASE_REVOKE:
2917                 if (di->lease_session == session) {
2918                         if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2919                                 h->seq = cpu_to_le32(di->lease_seq);
2920                         __ceph_mdsc_drop_dentry_lease(dentry);
2921                 }
2922                 release = 1;
2923                 break;
2924 
2925         case CEPH_MDS_LEASE_RENEW:
2926                 if (di->lease_session == session &&
2927                     di->lease_gen == session->s_cap_gen &&
2928                     di->lease_renew_from &&
2929                     di->lease_renew_after == 0) {
2930                         unsigned long duration =
2931                                 le32_to_cpu(h->duration_ms) * HZ / 1000;
2932 
2933                         di->lease_seq = seq;
2934                         dentry->d_time = di->lease_renew_from + duration;
2935                         di->lease_renew_after = di->lease_renew_from +
2936                                 (duration >> 1);
2937                         di->lease_renew_from = 0;
2938                 }
2939                 break;
2940         }
2941         spin_unlock(&dentry->d_lock);
2942         dput(dentry);
2943 
2944         if (!release)
2945                 goto out;
2946 
2947 release:
2948         /* let's just reuse the same message */
2949         h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2950         ceph_msg_get(msg);
2951         ceph_con_send(&session->s_con, msg);
2952 
2953 out:
2954         iput(inode);
2955         mutex_unlock(&session->s_mutex);
2956         return;
2957 
2958 bad:
2959         pr_err("corrupt lease message\n");
2960         ceph_msg_dump(msg);
2961 }
2962 
2963 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2964                               struct inode *inode,
2965                               struct dentry *dentry, char action,
2966                               u32 seq)
2967 {
2968         struct ceph_msg *msg;
2969         struct ceph_mds_lease *lease;
2970         int len = sizeof(*lease) + sizeof(u32);
2971         int dnamelen = 0;
2972 
2973         dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2974              inode, dentry, ceph_lease_op_name(action), session->s_mds);
2975         dnamelen = dentry->d_name.len;
2976         len += dnamelen;
2977 
2978         msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
2979         if (!msg)
2980                 return;
2981         lease = msg->front.iov_base;
2982         lease->action = action;
2983         lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2984         lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2985         lease->seq = cpu_to_le32(seq);
2986         put_unaligned_le32(dnamelen, lease + 1);
2987         memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2988 
2989         /*
2990          * if this is a preemptive lease RELEASE, no need to
2991          * flush request stream, since the actual request will
2992          * soon follow.
2993          */
2994         msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2995 
2996         ceph_con_send(&session->s_con, msg);
2997 }
2998 
2999 /*
3000  * Preemptively release a lease we expect to invalidate anyway.
3001  * Pass @inode always, @dentry is optional.
3002  */
3003 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
3004                              struct dentry *dentry)
3005 {
3006         struct ceph_dentry_info *di;
3007         struct ceph_mds_session *session;
3008         u32 seq;
3009 
3010         BUG_ON(inode == NULL);
3011         BUG_ON(dentry == NULL);
3012 
3013         /* is dentry lease valid? */
3014         spin_lock(&dentry->d_lock);
3015         di = ceph_dentry(dentry);
3016         if (!di || !di->lease_session ||
3017             di->lease_session->s_mds < 0 ||
3018             di->lease_gen != di->lease_session->s_cap_gen ||
3019             !time_before(jiffies, dentry->d_time)) {
3020                 dout("lease_release inode %p dentry %p -- "
3021                      "no lease\n",
3022                      inode, dentry);
3023                 spin_unlock(&dentry->d_lock);
3024                 return;
3025         }
3026 
3027         /* we do have a lease on this dentry; note mds and seq */
3028         session = ceph_get_mds_session(di->lease_session);
3029         seq = di->lease_seq;
3030         __ceph_mdsc_drop_dentry_lease(dentry);
3031         spin_unlock(&dentry->d_lock);
3032 
3033         dout("lease_release inode %p dentry %p to mds%d\n",
3034              inode, dentry, session->s_mds);
3035         ceph_mdsc_lease_send_msg(session, inode, dentry,
3036                                  CEPH_MDS_LEASE_RELEASE, seq);
3037         ceph_put_mds_session(session);
3038 }
3039 
3040 /*
3041  * drop all leases (and dentry refs) in preparation for umount
3042  */
3043 static void drop_leases(struct ceph_mds_client *mdsc)
3044 {
3045         int i;
3046 
3047         dout("drop_leases\n");
3048         mutex_lock(&mdsc->mutex);
3049         for (i = 0; i < mdsc->max_sessions; i++) {
3050                 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
3051                 if (!s)
3052                         continue;
3053                 mutex_unlock(&mdsc->mutex);
3054                 mutex_lock(&s->s_mutex);
3055                 mutex_unlock(&s->s_mutex);
3056                 ceph_put_mds_session(s);
3057                 mutex_lock(&mdsc->mutex);
3058         }
3059         mutex_unlock(&mdsc->mutex);
3060 }
3061 
3062 
3063 
3064 /*
3065  * delayed work -- periodically trim expired leases, renew caps with mds
3066  */
3067 static void schedule_delayed(struct ceph_mds_client *mdsc)
3068 {
3069         int delay = 5;
3070         unsigned hz = round_jiffies_relative(HZ * delay);
3071         schedule_delayed_work(&mdsc->delayed_work, hz);
3072 }
3073 
3074 static void delayed_work(struct work_struct *work)
3075 {
3076         int i;
3077         struct ceph_mds_client *mdsc =
3078                 container_of(work, struct ceph_mds_client, delayed_work.work);
3079         int renew_interval;
3080         int renew_caps;
3081 
3082         dout("mdsc delayed_work\n");
3083         ceph_check_delayed_caps(mdsc);
3084 
3085         mutex_lock(&mdsc->mutex);
3086         renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
3087         renew_caps = time_after_eq(jiffies, HZ*renew_interval +
3088                                    mdsc->last_renew_caps);
3089         if (renew_caps)
3090                 mdsc->last_renew_caps = jiffies;
3091 
3092         for (i = 0; i < mdsc->max_sessions; i++) {
3093                 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
3094                 if (s == NULL)
3095                         continue;
3096                 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
3097                         dout("resending session close request for mds%d\n",
3098                              s->s_mds);
3099                         request_close_session(mdsc, s);
3100                         ceph_put_mds_session(s);
3101                         continue;
3102                 }
3103                 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
3104                         if (s->s_state == CEPH_MDS_SESSION_OPEN) {
3105                                 s->s_state = CEPH_MDS_SESSION_HUNG;
3106                                 pr_info("mds%d hung\n", s->s_mds);
3107                         }
3108                 }
3109                 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
3110                         /* this mds is failed or recovering, just wait */
3111                         ceph_put_mds_session(s);
3112                         continue;
3113                 }
3114                 mutex_unlock(&mdsc->mutex);
3115 
3116                 mutex_lock(&s->s_mutex);
3117                 if (renew_caps)
3118                         send_renew_caps(mdsc, s);
3119                 else
3120                         ceph_con_keepalive(&s->s_con);
3121                 ceph_add_cap_releases(mdsc, s);
3122                 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
3123                     s->s_state == CEPH_MDS_SESSION_HUNG)
3124                         ceph_send_cap_releases(mdsc, s);
3125                 mutex_unlock(&s->s_mutex);
3126                 ceph_put_mds_session(s);
3127 
3128                 mutex_lock(&mdsc->mutex);
3129         }
3130         mutex_unlock(&mdsc->mutex);
3131 
3132         schedule_delayed(mdsc);
3133 }
3134 
3135 int ceph_mdsc_init(struct ceph_fs_client *fsc)
3136 
3137 {
3138         struct ceph_mds_client *mdsc;
3139 
3140         mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
3141         if (!mdsc)
3142                 return -ENOMEM;
3143         mdsc->fsc = fsc;
3144         fsc->mdsc = mdsc;
3145         mutex_init(&mdsc->mutex);
3146         mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3147         if (mdsc->mdsmap == NULL) {
3148                 kfree(mdsc);
3149                 return -ENOMEM;
3150         }
3151 
3152         init_completion(&mdsc->safe_umount_waiters);
3153         init_waitqueue_head(&mdsc->session_close_wq);
3154         INIT_LIST_HEAD(&mdsc->waiting_for_map);
3155         mdsc->sessions = NULL;
3156         mdsc->max_sessions = 0;
3157         mdsc->stopping = 0;
3158         init_rwsem(&mdsc->snap_rwsem);
3159         mdsc->snap_realms = RB_ROOT;
3160         INIT_LIST_HEAD(&mdsc->snap_empty);
3161         spin_lock_init(&mdsc->snap_empty_lock);
3162         mdsc->last_tid = 0;
3163         mdsc->request_tree = RB_ROOT;
3164         INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
3165         mdsc->last_renew_caps = jiffies;
3166         INIT_LIST_HEAD(&mdsc->cap_delay_list);
3167         spin_lock_init(&mdsc->cap_delay_lock);
3168         INIT_LIST_HEAD(&mdsc->snap_flush_list);
3169         spin_lock_init(&mdsc->snap_flush_lock);
3170         mdsc->cap_flush_seq = 0;
3171         INIT_LIST_HEAD(&mdsc->cap_dirty);
3172         INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3173         mdsc->num_cap_flushing = 0;
3174         spin_lock_init(&mdsc->cap_dirty_lock);
3175         init_waitqueue_head(&mdsc->cap_flushing_wq);
3176         spin_lock_init(&mdsc->dentry_lru_lock);
3177         INIT_LIST_HEAD(&mdsc->dentry_lru);
3178 
3179         ceph_caps_init(mdsc);
3180         ceph_adjust_min_caps(mdsc, fsc->min_caps);
3181 
3182         return 0;
3183 }
3184 
3185 /*
3186  * Wait for safe replies on open mds requests.  If we time out, drop
3187  * all requests from the tree to avoid dangling dentry refs.
3188  */
3189 static void wait_requests(struct ceph_mds_client *mdsc)
3190 {
3191         struct ceph_mds_request *req;
3192         struct ceph_fs_client *fsc = mdsc->fsc;
3193 
3194         mutex_lock(&mdsc->mutex);
3195         if (__get_oldest_req(mdsc)) {
3196                 mutex_unlock(&mdsc->mutex);
3197 
3198                 dout("wait_requests waiting for requests\n");
3199                 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
3200                                     fsc->client->options->mount_timeout * HZ);
3201 
3202                 /* tear down remaining requests */
3203                 mutex_lock(&mdsc->mutex);
3204                 while ((req = __get_oldest_req(mdsc))) {
3205                         dout("wait_requests timed out on tid %llu\n",
3206                              req->r_tid);
3207                         __unregister_request(mdsc, req);
3208                 }
3209         }
3210         mutex_unlock(&mdsc->mutex);
3211         dout("wait_requests done\n");
3212 }
3213 
3214 /*
3215  * called before mount is ro, and before dentries are torn down.
3216  * (hmm, does this still race with new lookups?)
3217  */
3218 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
3219 {
3220         dout("pre_umount\n");
3221         mdsc->stopping = 1;
3222 
3223         drop_leases(mdsc);
3224         ceph_flush_dirty_caps(mdsc);
3225         wait_requests(mdsc);
3226 
3227         /*
3228          * wait for reply handlers to drop their request refs and
3229          * their inode/dcache refs
3230          */
3231         ceph_msgr_flush();
3232 }
3233 
3234 /*
3235  * wait for all write mds requests to flush.
3236  */
3237 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
3238 {
3239         struct ceph_mds_request *req = NULL, *nextreq;
3240         struct rb_node *n;
3241 
3242         mutex_lock(&mdsc->mutex);
3243         dout("wait_unsafe_requests want %lld\n", want_tid);
3244 restart:
3245         req = __get_oldest_req(mdsc);
3246         while (req && req->r_tid <= want_tid) {
3247                 /* find next request */
3248                 n = rb_next(&req->r_node);
3249                 if (n)
3250                         nextreq = rb_entry(n, struct ceph_mds_request, r_node);
3251                 else
3252                         nextreq = NULL;
3253                 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
3254                         /* write op */
3255                         ceph_mdsc_get_request(req);
3256                         if (nextreq)
3257                                 ceph_mdsc_get_request(nextreq);
3258                         mutex_unlock(&mdsc->mutex);
3259                         dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
3260                              req->r_tid, want_tid);
3261                         wait_for_completion(&req->r_safe_completion);
3262                         mutex_lock(&mdsc->mutex);
3263                         ceph_mdsc_put_request(req);
3264                         if (!nextreq)
3265                                 break;  /* next dne before, so we're done! */
3266                         if (RB_EMPTY_NODE(&nextreq->r_node)) {
3267                                 /* next request was removed from tree */
3268                                 ceph_mdsc_put_request(nextreq);
3269                                 goto restart;
3270                         }
3271                         ceph_mdsc_put_request(nextreq);  /* won't go away */
3272                 }
3273                 req = nextreq;
3274         }
3275         mutex_unlock(&mdsc->mutex);
3276         dout("wait_unsafe_requests done\n");
3277 }
3278 
3279 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3280 {
3281         u64 want_tid, want_flush;
3282 
3283         if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3284                 return;
3285 
3286         dout("sync\n");
3287         mutex_lock(&mdsc->mutex);
3288         want_tid = mdsc->last_tid;
3289         want_flush = mdsc->cap_flush_seq;
3290         mutex_unlock(&mdsc->mutex);
3291         dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3292 
3293         ceph_flush_dirty_caps(mdsc);
3294 
3295         wait_unsafe_requests(mdsc, want_tid);
3296         wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
3297 }
3298 
3299 /*
3300  * true if all sessions are closed, or we force unmount
3301  */
3302 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3303 {
3304         int i, n = 0;
3305 
3306         if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3307                 return true;
3308 
3309         mutex_lock(&mdsc->mutex);
3310         for (i = 0; i < mdsc->max_sessions; i++)
3311                 if (mdsc->sessions[i])
3312                         n++;
3313         mutex_unlock(&mdsc->mutex);
3314         return n == 0;
3315 }
3316 
3317 /*
3318  * called after sb is ro.
3319  */
3320 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3321 {
3322         struct ceph_mds_session *session;
3323         int i;
3324         struct ceph_fs_client *fsc = mdsc->fsc;
3325         unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3326 
3327         dout("close_sessions\n");
3328 
3329         /* close sessions */
3330         mutex_lock(&mdsc->mutex);
3331         for (i = 0; i < mdsc->max_sessions; i++) {
3332                 session = __ceph_lookup_mds_session(mdsc, i);
3333                 if (!session)
3334                         continue;
3335                 mutex_unlock(&mdsc->mutex);
3336                 mutex_lock(&session->s_mutex);
3337                 __close_session(mdsc, session);
3338                 mutex_unlock(&session->s_mutex);
3339                 ceph_put_mds_session(session);
3340                 mutex_lock(&mdsc->mutex);
3341         }
3342         mutex_unlock(&mdsc->mutex);
3343 
3344         dout("waiting for sessions to close\n");
3345         wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3346                            timeout);
3347 
3348         /* tear down remaining sessions */
3349         mutex_lock(&mdsc->mutex);
3350         for (i = 0; i < mdsc->max_sessions; i++) {
3351                 if (mdsc->sessions[i]) {
3352                         session = get_session(mdsc->sessions[i]);
3353                         __unregister_session(mdsc, session);
3354                         mutex_unlock(&mdsc->mutex);
3355                         mutex_lock(&session->s_mutex);
3356                         remove_session_caps(session);
3357                         mutex_unlock(&session->s_mutex);
3358                         ceph_put_mds_session(session);
3359                         mutex_lock(&mdsc->mutex);
3360                 }
3361         }
3362         WARN_ON(!list_empty(&mdsc->cap_delay_list));
3363         mutex_unlock(&mdsc->mutex);
3364 
3365         ceph_cleanup_empty_realms(mdsc);
3366 
3367         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3368 
3369         dout("stopped\n");
3370 }
3371 
3372 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3373 {
3374         dout("stop\n");
3375         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3376         if (mdsc->mdsmap)
3377                 ceph_mdsmap_destroy(mdsc->mdsmap);
3378         kfree(mdsc->sessions);
3379         ceph_caps_finalize(mdsc);
3380 }
3381 
3382 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3383 {
3384         struct ceph_mds_client *mdsc = fsc->mdsc;
3385 
3386         dout("mdsc_destroy %p\n", mdsc);
3387         ceph_mdsc_stop(mdsc);
3388 
3389         /* flush out any connection work with references to us */
3390         ceph_msgr_flush();
3391 
3392         fsc->mdsc = NULL;
3393         kfree(mdsc);
3394         dout("mdsc_destroy %p done\n", mdsc);
3395 }
3396 
3397 
3398 /*
3399  * handle mds map update.
3400  */
3401 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3402 {
3403         u32 epoch;
3404         u32 maplen;
3405         void *p = msg->front.iov_base;
3406         void *end = p + msg->front.iov_len;
3407         struct ceph_mdsmap *newmap, *oldmap;
3408         struct ceph_fsid fsid;
3409         int err = -EINVAL;
3410 
3411         ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3412         ceph_decode_copy(&p, &fsid, sizeof(fsid));
3413         if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3414                 return;
3415         epoch = ceph_decode_32(&p);
3416         maplen = ceph_decode_32(&p);
3417         dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3418 
3419         /* do we need it? */
3420         ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3421         mutex_lock(&mdsc->mutex);
3422         if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3423                 dout("handle_map epoch %u <= our %u\n",
3424                      epoch, mdsc->mdsmap->m_epoch);
3425                 mutex_unlock(&mdsc->mutex);
3426                 return;
3427         }
3428 
3429         newmap = ceph_mdsmap_decode(&p, end);
3430         if (IS_ERR(newmap)) {
3431                 err = PTR_ERR(newmap);
3432                 goto bad_unlock;
3433         }
3434 
3435         /* swap into place */
3436         if (mdsc->mdsmap) {
3437                 oldmap = mdsc->mdsmap;
3438                 mdsc->mdsmap = newmap;
3439                 check_new_map(mdsc, newmap, oldmap);
3440                 ceph_mdsmap_destroy(oldmap);
3441         } else {
3442                 mdsc->mdsmap = newmap;  /* first mds map */
3443         }
3444         mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3445 
3446         __wake_requests(mdsc, &mdsc->waiting_for_map);
3447 
3448         mutex_unlock(&mdsc->mutex);
3449         schedule_delayed(mdsc);
3450         return;
3451 
3452 bad_unlock:
3453         mutex_unlock(&mdsc->mutex);
3454 bad:
3455         pr_err("error decoding mdsmap %d\n", err);
3456         return;
3457 }
3458 
3459 static struct ceph_connection *con_get(struct ceph_connection *con)
3460 {
3461         struct ceph_mds_session *s = con->private;
3462 
3463         if (get_session(s)) {
3464                 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
3465                 return con;
3466         }
3467         dout("mdsc con_get %p FAIL\n", s);
3468         return NULL;
3469 }
3470 
3471 static void con_put(struct ceph_connection *con)
3472 {
3473         struct ceph_mds_session *s = con->private;
3474 
3475         dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
3476         ceph_put_mds_session(s);
3477 }
3478 
3479 /*
3480  * if the client is unresponsive for long enough, the mds will kill
3481  * the session entirely.
3482  */
3483 static void peer_reset(struct ceph_connection *con)
3484 {
3485         struct ceph_mds_session *s = con->private;
3486         struct ceph_mds_client *mdsc = s->s_mdsc;
3487 
3488         pr_warning("mds%d closed our session\n", s->s_mds);
3489         send_mds_reconnect(mdsc, s);
3490 }
3491 
3492 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
3493 {
3494         struct ceph_mds_session *s = con->private;
3495         struct ceph_mds_client *mdsc = s->s_mdsc;
3496         int type = le16_to_cpu(msg->hdr.type);
3497 
3498         mutex_lock(&mdsc->mutex);
3499         if (__verify_registered_session(mdsc, s) < 0) {
3500                 mutex_unlock(&mdsc->mutex);
3501                 goto out;
3502         }
3503         mutex_unlock(&mdsc->mutex);
3504 
3505         switch (type) {
3506         case CEPH_MSG_MDS_MAP:
3507                 ceph_mdsc_handle_map(mdsc, msg);
3508                 break;
3509         case CEPH_MSG_CLIENT_SESSION:
3510                 handle_session(s, msg);
3511                 break;
3512         case CEPH_MSG_CLIENT_REPLY:
3513                 handle_reply(s, msg);
3514                 break;
3515         case CEPH_MSG_CLIENT_REQUEST_FORWARD:
3516                 handle_forward(mdsc, s, msg);
3517                 break;
3518         case CEPH_MSG_CLIENT_CAPS:
3519                 ceph_handle_caps(s, msg);
3520                 break;
3521         case CEPH_MSG_CLIENT_SNAP:
3522                 ceph_handle_snap(mdsc, s, msg);
3523                 break;
3524         case CEPH_MSG_CLIENT_LEASE:
3525                 handle_lease(mdsc, s, msg);
3526                 break;
3527 
3528         default:
3529                 pr_err("received unknown message type %d %s\n", type,
3530                        ceph_msg_type_name(type));
3531         }
3532 out:
3533         ceph_msg_put(msg);
3534 }
3535 
3536 /*
3537  * authentication
3538  */
3539 
3540 /*
3541  * Note: returned pointer is the address of a structure that's
3542  * managed separately.  Caller must *not* attempt to free it.
3543  */
3544 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
3545                                         int *proto, int force_new)
3546 {
3547         struct ceph_mds_session *s = con->private;
3548         struct ceph_mds_client *mdsc = s->s_mdsc;
3549         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3550         struct ceph_auth_handshake *auth = &s->s_auth;
3551 
3552         if (force_new && auth->authorizer) {
3553                 ceph_auth_destroy_authorizer(ac, auth->authorizer);
3554                 auth->authorizer = NULL;
3555         }
3556         if (!auth->authorizer) {
3557                 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3558                                                       auth);
3559                 if (ret)
3560                         return ERR_PTR(ret);
3561         } else {
3562                 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3563                                                       auth);
3564                 if (ret)
3565                         return ERR_PTR(ret);
3566         }
3567         *proto = ac->protocol;
3568 
3569         return auth;
3570 }
3571 
3572 
3573 static int verify_authorizer_reply(struct ceph_connection *con, int len)
3574 {
3575         struct ceph_mds_session *s = con->private;
3576         struct ceph_mds_client *mdsc = s->s_mdsc;
3577         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3578 
3579         return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
3580 }
3581 
3582 static int invalidate_authorizer(struct ceph_connection *con)
3583 {
3584         struct ceph_mds_session *s = con->private;
3585         struct ceph_mds_client *mdsc = s->s_mdsc;
3586         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3587 
3588         ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3589 
3590         return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3591 }
3592 
3593 static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
3594                                 struct ceph_msg_header *hdr, int *skip)
3595 {
3596         struct ceph_msg *msg;
3597         int type = (int) le16_to_cpu(hdr->type);
3598         int front_len = (int) le32_to_cpu(hdr->front_len);
3599 
3600         if (con->in_msg)
3601                 return con->in_msg;
3602 
3603         *skip = 0;
3604         msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
3605         if (!msg) {
3606                 pr_err("unable to allocate msg type %d len %d\n",
3607                        type, front_len);
3608                 return NULL;
3609         }
3610 
3611         return msg;
3612 }
3613 
3614 static const struct ceph_connection_operations mds_con_ops = {
3615         .get = con_get,
3616         .put = con_put,
3617         .dispatch = dispatch,
3618         .get_authorizer = get_authorizer,
3619         .verify_authorizer_reply = verify_authorizer_reply,
3620         .invalidate_authorizer = invalidate_authorizer,
3621         .peer_reset = peer_reset,
3622         .alloc_msg = mds_alloc_msg,
3623 };
3624 
3625 /* eof */
3626 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp