TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/x86.c

  1 /*
  2  * Kernel-based Virtual Machine driver for Linux
  3  *
  4  * derived from drivers/kvm/kvm_main.c
  5  *
  6  * Copyright (C) 2006 Qumranet, Inc.
  7  * Copyright (C) 2008 Qumranet, Inc.
  8  * Copyright IBM Corporation, 2008
  9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 10  *
 11  * Authors:
 12  *   Avi Kivity   <avi@qumranet.com>
 13  *   Yaniv Kamay  <yaniv@qumranet.com>
 14  *   Amit Shah    <amit.shah@qumranet.com>
 15  *   Ben-Ami Yassour <benami@il.ibm.com>
 16  *
 17  * This work is licensed under the terms of the GNU GPL, version 2.  See
 18  * the COPYING file in the top-level directory.
 19  *
 20  */
 21 
 22 #include <linux/kvm_host.h>
 23 #include "irq.h"
 24 #include "mmu.h"
 25 #include "i8254.h"
 26 #include "tss.h"
 27 #include "kvm_cache_regs.h"
 28 #include "x86.h"
 29 #include "cpuid.h"
 30 #include "pmu.h"
 31 #include "hyperv.h"
 32 
 33 #include <linux/clocksource.h>
 34 #include <linux/interrupt.h>
 35 #include <linux/kvm.h>
 36 #include <linux/fs.h>
 37 #include <linux/vmalloc.h>
 38 #include <linux/export.h>
 39 #include <linux/moduleparam.h>
 40 #include <linux/mman.h>
 41 #include <linux/highmem.h>
 42 #include <linux/iommu.h>
 43 #include <linux/intel-iommu.h>
 44 #include <linux/cpufreq.h>
 45 #include <linux/user-return-notifier.h>
 46 #include <linux/srcu.h>
 47 #include <linux/slab.h>
 48 #include <linux/perf_event.h>
 49 #include <linux/uaccess.h>
 50 #include <linux/hash.h>
 51 #include <linux/pci.h>
 52 #include <linux/timekeeper_internal.h>
 53 #include <linux/pvclock_gtod.h>
 54 #include <linux/kvm_irqfd.h>
 55 #include <linux/irqbypass.h>
 56 #include <linux/sched/stat.h>
 57 #include <linux/mem_encrypt.h>
 58 
 59 #include <trace/events/kvm.h>
 60 
 61 #include <asm/debugreg.h>
 62 #include <asm/msr.h>
 63 #include <asm/desc.h>
 64 #include <asm/mce.h>
 65 #include <linux/kernel_stat.h>
 66 #include <asm/fpu/internal.h> /* Ugh! */
 67 #include <asm/pvclock.h>
 68 #include <asm/div64.h>
 69 #include <asm/irq_remapping.h>
 70 #include <asm/mshyperv.h>
 71 #include <asm/hypervisor.h>
 72 
 73 #define CREATE_TRACE_POINTS
 74 #include "trace.h"
 75 
 76 #define MAX_IO_MSRS 256
 77 #define KVM_MAX_MCE_BANKS 32
 78 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 79 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 80 
 81 #define emul_to_vcpu(ctxt) \
 82         container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
 83 
  84 /* EFER defaults:
  85  * - enable syscall by default because it's emulated by KVM
  86  * - enable LME and LMA by default on 64-bit KVM
  87  */
 88 #ifdef CONFIG_X86_64
 89 static
 90 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 91 #else
 92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 93 #endif
 94 
 95 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 96 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 97 
 98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 99                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
100 
101 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102 static void process_nmi(struct kvm_vcpu *vcpu);
103 static void enter_smm(struct kvm_vcpu *vcpu);
104 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105 static void store_regs(struct kvm_vcpu *vcpu);
106 static int sync_regs(struct kvm_vcpu *vcpu);
107 
108 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
109 EXPORT_SYMBOL_GPL(kvm_x86_ops);
110 
 111 static bool __read_mostly ignore_msrs = false;
112 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
113 
114 static bool __read_mostly report_ignored_msrs = true;
115 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
116 
117 unsigned int min_timer_period_us = 200;
118 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
119 
120 static bool __read_mostly kvmclock_periodic_sync = true;
121 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
122 
123 bool __read_mostly kvm_has_tsc_control;
124 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
125 u32  __read_mostly kvm_max_guest_tsc_khz;
126 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
127 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
128 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
129 u64  __read_mostly kvm_max_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
131 u64 __read_mostly kvm_default_tsc_scaling_ratio;
132 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
133 
134 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
135 static u32 __read_mostly tsc_tolerance_ppm = 250;
136 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
137 
138 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
139 unsigned int __read_mostly lapic_timer_advance_ns = 0;
140 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
141 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
142 
143 static bool __read_mostly vector_hashing = true;
144 module_param(vector_hashing, bool, S_IRUGO);
145 
146 bool __read_mostly enable_vmware_backdoor = false;
147 module_param(enable_vmware_backdoor, bool, S_IRUGO);
148 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
149 
150 static bool __read_mostly force_emulation_prefix = false;
151 module_param(force_emulation_prefix, bool, S_IRUGO);
152 
153 #define KVM_NR_SHARED_MSRS 16
154 
155 struct kvm_shared_msrs_global {
156         int nr;
157         u32 msrs[KVM_NR_SHARED_MSRS];
158 };
159 
160 struct kvm_shared_msrs {
161         struct user_return_notifier urn;
162         bool registered;
163         struct kvm_shared_msr_values {
164                 u64 host;
165                 u64 curr;
166         } values[KVM_NR_SHARED_MSRS];
167 };
168 
169 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
170 static struct kvm_shared_msrs __percpu *shared_msrs;
171 
172 struct kvm_stats_debugfs_item debugfs_entries[] = {
173         { "pf_fixed", VCPU_STAT(pf_fixed) },
174         { "pf_guest", VCPU_STAT(pf_guest) },
175         { "tlb_flush", VCPU_STAT(tlb_flush) },
176         { "invlpg", VCPU_STAT(invlpg) },
177         { "exits", VCPU_STAT(exits) },
178         { "io_exits", VCPU_STAT(io_exits) },
179         { "mmio_exits", VCPU_STAT(mmio_exits) },
180         { "signal_exits", VCPU_STAT(signal_exits) },
181         { "irq_window", VCPU_STAT(irq_window_exits) },
182         { "nmi_window", VCPU_STAT(nmi_window_exits) },
183         { "halt_exits", VCPU_STAT(halt_exits) },
184         { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
185         { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
186         { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
187         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
188         { "hypercalls", VCPU_STAT(hypercalls) },
189         { "request_irq", VCPU_STAT(request_irq_exits) },
190         { "irq_exits", VCPU_STAT(irq_exits) },
191         { "host_state_reload", VCPU_STAT(host_state_reload) },
192         { "fpu_reload", VCPU_STAT(fpu_reload) },
193         { "insn_emulation", VCPU_STAT(insn_emulation) },
194         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
195         { "irq_injections", VCPU_STAT(irq_injections) },
196         { "nmi_injections", VCPU_STAT(nmi_injections) },
197         { "req_event", VCPU_STAT(req_event) },
198         { "l1d_flush", VCPU_STAT(l1d_flush) },
199         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
200         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
201         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
202         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
203         { "mmu_flooded", VM_STAT(mmu_flooded) },
204         { "mmu_recycled", VM_STAT(mmu_recycled) },
205         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
206         { "mmu_unsync", VM_STAT(mmu_unsync) },
207         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
208         { "largepages", VM_STAT(lpages) },
209         { "max_mmu_page_hash_collisions",
210                 VM_STAT(max_mmu_page_hash_collisions) },
211         { NULL }
212 };
213 
214 u64 __read_mostly host_xcr0;
215 
216 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
217 
218 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
219 {
220         int i;
221         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
222                 vcpu->arch.apf.gfns[i] = ~0;
223 }
224 
225 static void kvm_on_user_return(struct user_return_notifier *urn)
226 {
227         unsigned slot;
228         struct kvm_shared_msrs *locals
229                 = container_of(urn, struct kvm_shared_msrs, urn);
230         struct kvm_shared_msr_values *values;
231         unsigned long flags;
232 
233         /*
234          * Disabling irqs at this point since the following code could be
235          * interrupted and executed through kvm_arch_hardware_disable()
236          */
237         local_irq_save(flags);
238         if (locals->registered) {
239                 locals->registered = false;
240                 user_return_notifier_unregister(urn);
241         }
242         local_irq_restore(flags);
243         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
244                 values = &locals->values[slot];
245                 if (values->host != values->curr) {
246                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
247                         values->curr = values->host;
248                 }
249         }
250 }
251 
252 static void shared_msr_update(unsigned slot, u32 msr)
253 {
254         u64 value;
255         unsigned int cpu = smp_processor_id();
256         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
257 
 258         /* only read, and nobody should modify it at this time,
 259          * so we don't need a lock */
260         if (slot >= shared_msrs_global.nr) {
261                 printk(KERN_ERR "kvm: invalid MSR slot!");
262                 return;
263         }
264         rdmsrl_safe(msr, &value);
265         smsr->values[slot].host = value;
266         smsr->values[slot].curr = value;
267 }
268 
269 void kvm_define_shared_msr(unsigned slot, u32 msr)
270 {
271         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
272         shared_msrs_global.msrs[slot] = msr;
273         if (slot >= shared_msrs_global.nr)
274                 shared_msrs_global.nr = slot + 1;
275 }
276 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
277 
278 static void kvm_shared_msr_cpu_online(void)
279 {
280         unsigned i;
281 
282         for (i = 0; i < shared_msrs_global.nr; ++i)
283                 shared_msr_update(i, shared_msrs_global.msrs[i]);
284 }
285 
286 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
287 {
288         unsigned int cpu = smp_processor_id();
289         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
290         int err;
291 
292         if (((value ^ smsr->values[slot].curr) & mask) == 0)
293                 return 0;
294         smsr->values[slot].curr = value;
295         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
296         if (err)
297                 return 1;
298 
299         if (!smsr->registered) {
300                 smsr->urn.on_user_return = kvm_on_user_return;
301                 user_return_notifier_register(&smsr->urn);
302                 smsr->registered = true;
303         }
304         return 0;
305 }
306 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
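/*
 * A minimal sketch of how the shared-MSR machinery above is meant to be used
 * by a vendor module (illustration only; the slot numbers and values are
 * hypothetical, the real callers live in vmx.c/svm.c):
 */
#if 0
static void example_define_shared_msrs(void)
{
	/* At hardware-setup time: declare the MSRs that will be switched. */
	kvm_define_shared_msr(0, MSR_STAR);
	kvm_define_shared_msr(1, MSR_LSTAR);
}

static void example_load_guest_msrs(u64 guest_star, u64 guest_lstar)
{
	/*
	 * Before entering the guest: install the guest values.  The host
	 * values are restored lazily by kvm_on_user_return() the next time
	 * this CPU returns to userspace, or by drop_user_return_notifiers()
	 * when hardware virtualization is disabled.
	 */
	kvm_set_shared_msr(0, guest_star, ~0ULL);
	kvm_set_shared_msr(1, guest_lstar, ~0ULL);
}
#endif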
307 
308 static void drop_user_return_notifiers(void)
309 {
310         unsigned int cpu = smp_processor_id();
311         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
312 
313         if (smsr->registered)
314                 kvm_on_user_return(&smsr->urn);
315 }
316 
317 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
318 {
319         return vcpu->arch.apic_base;
320 }
321 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
322 
323 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
324 {
325         return kvm_apic_mode(kvm_get_apic_base(vcpu));
326 }
327 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
328 
329 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
330 {
331         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
332         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
333         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
334                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
335 
336         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
337                 return 1;
338         if (!msr_info->host_initiated) {
339                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
340                         return 1;
341                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
342                         return 1;
343         }
344 
345         kvm_lapic_set_base(vcpu, msr_info->data);
346         return 0;
347 }
348 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
349 
350 asmlinkage __visible void kvm_spurious_fault(void)
351 {
352         /* Fault while not rebooting.  We want the trace. */
353         BUG();
354 }
355 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
356 
357 #define EXCPT_BENIGN            0
358 #define EXCPT_CONTRIBUTORY      1
359 #define EXCPT_PF                2
360 
361 static int exception_class(int vector)
362 {
363         switch (vector) {
364         case PF_VECTOR:
365                 return EXCPT_PF;
366         case DE_VECTOR:
367         case TS_VECTOR:
368         case NP_VECTOR:
369         case SS_VECTOR:
370         case GP_VECTOR:
371                 return EXCPT_CONTRIBUTORY;
372         default:
373                 break;
374         }
375         return EXCPT_BENIGN;
376 }
377 
378 #define EXCPT_FAULT             0
379 #define EXCPT_TRAP              1
380 #define EXCPT_ABORT             2
381 #define EXCPT_INTERRUPT         3
382 
383 static int exception_type(int vector)
384 {
385         unsigned int mask;
386 
387         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
388                 return EXCPT_INTERRUPT;
389 
390         mask = 1 << vector;
391 
392         /* #DB is trap, as instruction watchpoints are handled elsewhere */
393         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
394                 return EXCPT_TRAP;
395 
396         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
397                 return EXCPT_ABORT;
398 
399         /* Reserved exceptions will result in fault */
400         return EXCPT_FAULT;
401 }
402 
403 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
404                 unsigned nr, bool has_error, u32 error_code,
405                 bool reinject)
406 {
407         u32 prev_nr;
408         int class1, class2;
409 
410         kvm_make_request(KVM_REQ_EVENT, vcpu);
411 
412         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
413         queue:
414                 if (has_error && !is_protmode(vcpu))
415                         has_error = false;
416                 if (reinject) {
417                         /*
418                          * On vmentry, vcpu->arch.exception.pending is only
419                          * true if an event injection was blocked by
420                          * nested_run_pending.  In that case, however,
421                          * vcpu_enter_guest requests an immediate exit,
422                          * and the guest shouldn't proceed far enough to
423                          * need reinjection.
424                          */
425                         WARN_ON_ONCE(vcpu->arch.exception.pending);
426                         vcpu->arch.exception.injected = true;
427                 } else {
428                         vcpu->arch.exception.pending = true;
429                         vcpu->arch.exception.injected = false;
430                 }
431                 vcpu->arch.exception.has_error_code = has_error;
432                 vcpu->arch.exception.nr = nr;
433                 vcpu->arch.exception.error_code = error_code;
434                 return;
435         }
436 
 437         /* An exception is already pending or injected; combine it with the new one. */
438         prev_nr = vcpu->arch.exception.nr;
439         if (prev_nr == DF_VECTOR) {
440                 /* triple fault -> shutdown */
441                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
442                 return;
443         }
444         class1 = exception_class(prev_nr);
445         class2 = exception_class(nr);
446         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
447                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
448                 /*
449                  * Generate double fault per SDM Table 5-5.  Set
450                  * exception.pending = true so that the double fault
451                  * can trigger a nested vmexit.
452                  */
453                 vcpu->arch.exception.pending = true;
454                 vcpu->arch.exception.injected = false;
455                 vcpu->arch.exception.has_error_code = true;
456                 vcpu->arch.exception.nr = DF_VECTOR;
457                 vcpu->arch.exception.error_code = 0;
458         } else
 459                 /* replace previous exception with a new one in the hope
 460                    that instruction re-execution will regenerate the lost
 461                    exception */
462                 goto queue;
463 }
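/*
 * Illustrative summary of the merge rules implemented above (derived from
 * SDM Table 5-5; not additional kernel code):
 *
 *   pending #DF  + any new exception        -> KVM_REQ_TRIPLE_FAULT (shutdown)
 *   pending #GP  + new #DE/#TS/#NP/#SS/#GP  -> #DF queued (contributory + contributory)
 *   pending #PF  + new #PF or contributory  -> #DF queued
 *   pending #GP  + new #PF                  -> new #PF replaces the pending #GP
 *   pending benign (e.g. #DB) + anything    -> new exception replaces the old one
 */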
464 
465 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
466 {
467         kvm_multiple_exception(vcpu, nr, false, 0, false);
468 }
469 EXPORT_SYMBOL_GPL(kvm_queue_exception);
470 
471 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
472 {
473         kvm_multiple_exception(vcpu, nr, false, 0, true);
474 }
475 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
476 
477 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
478 {
479         if (err)
480                 kvm_inject_gp(vcpu, 0);
481         else
482                 return kvm_skip_emulated_instruction(vcpu);
483 
484         return 1;
485 }
486 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
487 
488 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
489 {
490         ++vcpu->stat.pf_guest;
491         vcpu->arch.exception.nested_apf =
492                 is_guest_mode(vcpu) && fault->async_page_fault;
493         if (vcpu->arch.exception.nested_apf)
494                 vcpu->arch.apf.nested_apf_token = fault->address;
495         else
496                 vcpu->arch.cr2 = fault->address;
497         kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
498 }
499 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
500 
501 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
502 {
503         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
504                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
505         else
506                 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
507 
508         return fault->nested_page_fault;
509 }
510 
511 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
512 {
513         atomic_inc(&vcpu->arch.nmi_queued);
514         kvm_make_request(KVM_REQ_NMI, vcpu);
515 }
516 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
517 
518 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
519 {
520         kvm_multiple_exception(vcpu, nr, true, error_code, false);
521 }
522 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
523 
524 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
525 {
526         kvm_multiple_exception(vcpu, nr, true, error_code, true);
527 }
528 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
529 
530 /*
531  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
532  * a #GP and return false.
533  */
534 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
535 {
536         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
537                 return true;
538         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
539         return false;
540 }
541 EXPORT_SYMBOL_GPL(kvm_require_cpl);
542 
543 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
544 {
545         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
546                 return true;
547 
548         kvm_queue_exception(vcpu, UD_VECTOR);
549         return false;
550 }
551 EXPORT_SYMBOL_GPL(kvm_require_dr);
552 
553 /*
 554  * This function is used to read from the physical memory of the currently
 555  * running guest.  The difference from kvm_vcpu_read_guest_page is that this
 556  * function can read either from guest physical memory or from a nested guest's physical memory.
557  */
558 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
559                             gfn_t ngfn, void *data, int offset, int len,
560                             u32 access)
561 {
562         struct x86_exception exception;
563         gfn_t real_gfn;
564         gpa_t ngpa;
565 
566         ngpa     = gfn_to_gpa(ngfn);
567         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
568         if (real_gfn == UNMAPPED_GVA)
569                 return -EFAULT;
570 
571         real_gfn = gpa_to_gfn(real_gfn);
572 
573         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
574 }
575 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
576 
577 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
578                                void *data, int offset, int len, u32 access)
579 {
580         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
581                                        data, offset, len, access);
582 }
583 
584 /*
 585  * Load the PAE PDPTRs.  Return 1 if they are all valid, 0 otherwise.
586  */
587 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
588 {
589         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
590         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
591         int i;
592         int ret;
593         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
594 
595         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
596                                       offset * sizeof(u64), sizeof(pdpte),
597                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
598         if (ret < 0) {
599                 ret = 0;
600                 goto out;
601         }
602         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
603                 if ((pdpte[i] & PT_PRESENT_MASK) &&
604                     (pdpte[i] &
605                      vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
606                         ret = 0;
607                         goto out;
608                 }
609         }
610         ret = 1;
611 
612         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
613         __set_bit(VCPU_EXREG_PDPTR,
614                   (unsigned long *)&vcpu->arch.regs_avail);
615         __set_bit(VCPU_EXREG_PDPTR,
616                   (unsigned long *)&vcpu->arch.regs_dirty);
617 out:
618 
619         return ret;
620 }
621 EXPORT_SYMBOL_GPL(load_pdptrs);
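/*
 * Worked example of the CR3 arithmetic in load_pdptrs() (illustration only):
 * in PAE mode the PDPT is 32-byte aligned within a page, so for
 * cr3 = 0x12345a60:
 *
 *   pdpt_gfn    = cr3 >> PAGE_SHIFT          = 0x12345
 *   offset      = ((cr3 & 0xfff) >> 5) << 2  = 0x14c  (index in u64 entries)
 *   byte offset = offset * sizeof(u64)       = 0xa60  (cr3's 32-byte-aligned
 *                                                      offset within the page)
 *
 * i.e. the four 8-byte PDPTEs are read from guest physical address
 * (0x12345 << 12) + 0xa60.
 */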
622 
623 bool pdptrs_changed(struct kvm_vcpu *vcpu)
624 {
625         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
626         bool changed = true;
627         int offset;
628         gfn_t gfn;
629         int r;
630 
631         if (is_long_mode(vcpu) || !is_pae(vcpu))
632                 return false;
633 
634         if (!test_bit(VCPU_EXREG_PDPTR,
635                       (unsigned long *)&vcpu->arch.regs_avail))
636                 return true;
637 
638         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
639         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
640         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
641                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
642         if (r < 0)
643                 goto out;
644         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
645 out:
646 
647         return changed;
648 }
649 EXPORT_SYMBOL_GPL(pdptrs_changed);
650 
651 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
652 {
653         unsigned long old_cr0 = kvm_read_cr0(vcpu);
654         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
655 
656         cr0 |= X86_CR0_ET;
657 
658 #ifdef CONFIG_X86_64
659         if (cr0 & 0xffffffff00000000UL)
660                 return 1;
661 #endif
662 
663         cr0 &= ~CR0_RESERVED_BITS;
664 
665         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
666                 return 1;
667 
668         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
669                 return 1;
670 
671         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
672 #ifdef CONFIG_X86_64
673                 if ((vcpu->arch.efer & EFER_LME)) {
674                         int cs_db, cs_l;
675 
676                         if (!is_pae(vcpu))
677                                 return 1;
678                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
679                         if (cs_l)
680                                 return 1;
681                 } else
682 #endif
683                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
684                                                  kvm_read_cr3(vcpu)))
685                         return 1;
686         }
687 
688         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
689                 return 1;
690 
691         kvm_x86_ops->set_cr0(vcpu, cr0);
692 
693         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
694                 kvm_clear_async_pf_completion_queue(vcpu);
695                 kvm_async_pf_hash_reset(vcpu);
696         }
697 
698         if ((cr0 ^ old_cr0) & update_bits)
699                 kvm_mmu_reset_context(vcpu);
700 
701         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
702             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
703             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
704                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
705 
706         return 0;
707 }
708 EXPORT_SYMBOL_GPL(kvm_set_cr0);
709 
710 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
711 {
712         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
713 }
714 EXPORT_SYMBOL_GPL(kvm_lmsw);
715 
716 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
717 {
718         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
719                         !vcpu->guest_xcr0_loaded) {
720                 /* kvm_set_xcr() also depends on this */
721                 if (vcpu->arch.xcr0 != host_xcr0)
722                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
723                 vcpu->guest_xcr0_loaded = 1;
724         }
725 }
726 
727 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
728 {
729         if (vcpu->guest_xcr0_loaded) {
730                 if (vcpu->arch.xcr0 != host_xcr0)
731                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
732                 vcpu->guest_xcr0_loaded = 0;
733         }
734 }
735 
736 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
737 {
738         u64 xcr0 = xcr;
739         u64 old_xcr0 = vcpu->arch.xcr0;
740         u64 valid_bits;
741 
742         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
743         if (index != XCR_XFEATURE_ENABLED_MASK)
744                 return 1;
745         if (!(xcr0 & XFEATURE_MASK_FP))
746                 return 1;
747         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
748                 return 1;
749 
750         /*
751          * Do not allow the guest to set bits that we do not support
752          * saving.  However, xcr0 bit 0 is always set, even if the
753          * emulated CPU does not support XSAVE (see fx_init).
754          */
755         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
756         if (xcr0 & ~valid_bits)
757                 return 1;
758 
759         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
760             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
761                 return 1;
762 
763         if (xcr0 & XFEATURE_MASK_AVX512) {
764                 if (!(xcr0 & XFEATURE_MASK_YMM))
765                         return 1;
766                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
767                         return 1;
768         }
769         vcpu->arch.xcr0 = xcr0;
770 
771         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
772                 kvm_update_cpuid(vcpu);
773         return 0;
774 }
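/*
 * Examples of the XCR0 consistency rules enforced above (illustration only):
 *
 *   xcr0 = 0x1 (FP only)              -> accepted (minimum legal value)
 *   xcr0 = 0x3 (FP | SSE)             -> accepted
 *   xcr0 = 0x5 (FP | YMM, no SSE)     -> rejected, YMM requires SSE
 *   xcr0 = 0x2 (SSE only, no FP)      -> rejected, bit 0 must always be set
 *   only some AVX-512 bits set        -> rejected, the three AVX-512 components
 *                                        must be enabled together and need YMM
 *
 * In all cases xcr0 must also be a subset of guest_supported_xcr0.
 */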
775 
776 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
777 {
778         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
779             __kvm_set_xcr(vcpu, index, xcr)) {
780                 kvm_inject_gp(vcpu, 0);
781                 return 1;
782         }
783         return 0;
784 }
785 EXPORT_SYMBOL_GPL(kvm_set_xcr);
786 
787 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
788 {
789         unsigned long old_cr4 = kvm_read_cr4(vcpu);
790         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
791                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
792 
793         if (cr4 & CR4_RESERVED_BITS)
794                 return 1;
795 
796         if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
797                 return 1;
798 
799         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
800                 return 1;
801 
802         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
803                 return 1;
804 
805         if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
806                 return 1;
807 
808         if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
809                 return 1;
810 
811         if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
812                 return 1;
813 
814         if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
815                 return 1;
816 
817         if (is_long_mode(vcpu)) {
818                 if (!(cr4 & X86_CR4_PAE))
819                         return 1;
820         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
821                    && ((cr4 ^ old_cr4) & pdptr_bits)
822                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
823                                    kvm_read_cr3(vcpu)))
824                 return 1;
825 
826         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
827                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
828                         return 1;
829 
 830                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
831                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
832                         return 1;
833         }
834 
835         if (kvm_x86_ops->set_cr4(vcpu, cr4))
836                 return 1;
837 
838         if (((cr4 ^ old_cr4) & pdptr_bits) ||
839             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
840                 kvm_mmu_reset_context(vcpu);
841 
842         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
843                 kvm_update_cpuid(vcpu);
844 
845         return 0;
846 }
847 EXPORT_SYMBOL_GPL(kvm_set_cr4);
848 
849 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
850 {
851 #ifdef CONFIG_X86_64
852         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
853 
854         if (pcid_enabled)
855                 cr3 &= ~CR3_PCID_INVD;
856 #endif
857 
858         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
859                 kvm_mmu_sync_roots(vcpu);
860                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
861                 return 0;
862         }
863 
864         if (is_long_mode(vcpu) &&
865             (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
866                 return 1;
867         else if (is_pae(vcpu) && is_paging(vcpu) &&
868                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
869                 return 1;
870 
871         vcpu->arch.cr3 = cr3;
872         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
873         kvm_mmu_new_cr3(vcpu);
874         return 0;
875 }
876 EXPORT_SYMBOL_GPL(kvm_set_cr3);
877 
878 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
879 {
880         if (cr8 & CR8_RESERVED_BITS)
881                 return 1;
882         if (lapic_in_kernel(vcpu))
883                 kvm_lapic_set_tpr(vcpu, cr8);
884         else
885                 vcpu->arch.cr8 = cr8;
886         return 0;
887 }
888 EXPORT_SYMBOL_GPL(kvm_set_cr8);
889 
890 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
891 {
892         if (lapic_in_kernel(vcpu))
893                 return kvm_lapic_get_cr8(vcpu);
894         else
895                 return vcpu->arch.cr8;
896 }
897 EXPORT_SYMBOL_GPL(kvm_get_cr8);
898 
899 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
900 {
901         int i;
902 
903         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
904                 for (i = 0; i < KVM_NR_DB_REGS; i++)
905                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
906                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
907         }
908 }
909 
910 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
911 {
912         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
913                 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
914 }
915 
916 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
917 {
918         unsigned long dr7;
919 
920         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
921                 dr7 = vcpu->arch.guest_debug_dr7;
922         else
923                 dr7 = vcpu->arch.dr7;
924         kvm_x86_ops->set_dr7(vcpu, dr7);
925         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
926         if (dr7 & DR7_BP_EN_MASK)
927                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
928 }
929 
930 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
931 {
932         u64 fixed = DR6_FIXED_1;
933 
934         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
935                 fixed |= DR6_RTM;
936         return fixed;
937 }
938 
939 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
940 {
941         switch (dr) {
942         case 0 ... 3:
943                 vcpu->arch.db[dr] = val;
944                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
945                         vcpu->arch.eff_db[dr] = val;
946                 break;
947         case 4:
948                 /* fall through */
949         case 6:
950                 if (val & 0xffffffff00000000ULL)
951                         return -1; /* #GP */
952                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
953                 kvm_update_dr6(vcpu);
954                 break;
955         case 5:
956                 /* fall through */
957         default: /* 7 */
958                 if (val & 0xffffffff00000000ULL)
959                         return -1; /* #GP */
960                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
961                 kvm_update_dr7(vcpu);
962                 break;
963         }
964 
965         return 0;
966 }
967 
968 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
969 {
970         if (__kvm_set_dr(vcpu, dr, val)) {
971                 kvm_inject_gp(vcpu, 0);
972                 return 1;
973         }
974         return 0;
975 }
976 EXPORT_SYMBOL_GPL(kvm_set_dr);
977 
978 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
979 {
980         switch (dr) {
981         case 0 ... 3:
982                 *val = vcpu->arch.db[dr];
983                 break;
984         case 4:
985                 /* fall through */
986         case 6:
987                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
988                         *val = vcpu->arch.dr6;
989                 else
990                         *val = kvm_x86_ops->get_dr6(vcpu);
991                 break;
992         case 5:
993                 /* fall through */
994         default: /* 7 */
995                 *val = vcpu->arch.dr7;
996                 break;
997         }
998         return 0;
999 }
1000 EXPORT_SYMBOL_GPL(kvm_get_dr);
1001 
1002 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1003 {
1004         u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1005         u64 data;
1006         int err;
1007 
1008         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1009         if (err)
1010                 return err;
1011         kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1012         kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1013         return err;
1014 }
1015 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1016 
1017 /*
1018  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1019  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1020  *
1021  * This list is modified at module load time to reflect the
1022  * capabilities of the host cpu. This capabilities test skips MSRs that are
1023  * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1024  * may depend on host virtualization features rather than host cpu features.
1025  */
1026 
1027 static u32 msrs_to_save[] = {
1028         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1029         MSR_STAR,
1030 #ifdef CONFIG_X86_64
1031         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1032 #endif
1033         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1034         MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1035         MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1036 };
1037 
1038 static unsigned num_msrs_to_save;
1039 
1040 static u32 emulated_msrs[] = {
1041         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1042         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1043         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1044         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1045         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1046         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1047         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1048         HV_X64_MSR_RESET,
1049         HV_X64_MSR_VP_INDEX,
1050         HV_X64_MSR_VP_RUNTIME,
1051         HV_X64_MSR_SCONTROL,
1052         HV_X64_MSR_STIMER0_CONFIG,
1053         HV_X64_MSR_VP_ASSIST_PAGE,
1054         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1055         HV_X64_MSR_TSC_EMULATION_STATUS,
1056 
1057         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1058         MSR_KVM_PV_EOI_EN,
1059 
1060         MSR_IA32_TSC_ADJUST,
1061         MSR_IA32_TSCDEADLINE,
1062         MSR_IA32_MISC_ENABLE,
1063         MSR_IA32_MCG_STATUS,
1064         MSR_IA32_MCG_CTL,
1065         MSR_IA32_MCG_EXT_CTL,
1066         MSR_IA32_SMBASE,
1067         MSR_SMI_COUNT,
1068         MSR_PLATFORM_INFO,
1069         MSR_MISC_FEATURES_ENABLES,
1070         MSR_AMD64_VIRT_SPEC_CTRL,
1071 };
1072 
1073 static unsigned num_emulated_msrs;
1074 
1075 /*
1076  * List of msr numbers which are used to expose MSR-based features that
1077  * can be used by a hypervisor to validate requested CPU features.
1078  */
1079 static u32 msr_based_features[] = {
1080         MSR_IA32_VMX_BASIC,
1081         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1082         MSR_IA32_VMX_PINBASED_CTLS,
1083         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1084         MSR_IA32_VMX_PROCBASED_CTLS,
1085         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1086         MSR_IA32_VMX_EXIT_CTLS,
1087         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1088         MSR_IA32_VMX_ENTRY_CTLS,
1089         MSR_IA32_VMX_MISC,
1090         MSR_IA32_VMX_CR0_FIXED0,
1091         MSR_IA32_VMX_CR0_FIXED1,
1092         MSR_IA32_VMX_CR4_FIXED0,
1093         MSR_IA32_VMX_CR4_FIXED1,
1094         MSR_IA32_VMX_VMCS_ENUM,
1095         MSR_IA32_VMX_PROCBASED_CTLS2,
1096         MSR_IA32_VMX_EPT_VPID_CAP,
1097         MSR_IA32_VMX_VMFUNC,
1098 
1099         MSR_F10H_DECFG,
1100         MSR_IA32_UCODE_REV,
1101         MSR_IA32_ARCH_CAPABILITIES,
1102 };
1103 
1104 static unsigned int num_msr_based_features;
1105 
1106 u64 kvm_get_arch_capabilities(void)
1107 {
1108         u64 data;
1109 
1110         rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1111 
1112         /*
1113          * If we're doing cache flushes (either "always" or "cond")
1114          * we will do one whenever the guest does a vmlaunch/vmresume.
1115          * If an outer hypervisor is doing the cache flush for us
1116          * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1117          * capability to the guest too, and if EPT is disabled we're not
1118          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1119          * require a nested hypervisor to do a flush of its own.
1120          */
1121         if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1122                 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1123 
1124         return data;
1125 }
1126 EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
1127 
1128 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1129 {
1130         switch (msr->index) {
1131         case MSR_IA32_ARCH_CAPABILITIES:
1132                 msr->data = kvm_get_arch_capabilities();
1133                 break;
1134         case MSR_IA32_UCODE_REV:
1135                 rdmsrl_safe(msr->index, &msr->data);
1136                 break;
1137         default:
1138                 if (kvm_x86_ops->get_msr_feature(msr))
1139                         return 1;
1140         }
1141         return 0;
1142 }
1143 
1144 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1145 {
1146         struct kvm_msr_entry msr;
1147         int r;
1148 
1149         msr.index = index;
1150         r = kvm_get_msr_feature(&msr);
1151         if (r)
1152                 return r;
1153 
1154         *data = msr.data;
1155 
1156         return 0;
1157 }
1158 
1159 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1160 {
1161         if (efer & efer_reserved_bits)
1162                 return false;
1163 
1164         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1165                         return false;
1166 
1167         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1168                         return false;
1169 
1170         return true;
1171 }
1172 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1173 
1174 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
1175 {
1176         u64 old_efer = vcpu->arch.efer;
1177 
1178         if (!kvm_valid_efer(vcpu, efer))
1179                 return 1;
1180 
1181         if (is_paging(vcpu)
1182             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1183                 return 1;
1184 
1185         efer &= ~EFER_LMA;
1186         efer |= vcpu->arch.efer & EFER_LMA;
1187 
1188         kvm_x86_ops->set_efer(vcpu, efer);
1189 
1190         /* Update reserved bits */
1191         if ((efer ^ old_efer) & EFER_NX)
1192                 kvm_mmu_reset_context(vcpu);
1193 
1194         return 0;
1195 }
1196 
1197 void kvm_enable_efer_bits(u64 mask)
1198 {
1199        efer_reserved_bits &= ~mask;
1200 }
1201 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1202 
1203 /*
1204  * Writes the MSR value into the appropriate "register".
1205  * Returns 0 on success, non-0 otherwise.
1206  * Assumes vcpu_load() was already called.
1207  */
1208 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1209 {
1210         switch (msr->index) {
1211         case MSR_FS_BASE:
1212         case MSR_GS_BASE:
1213         case MSR_KERNEL_GS_BASE:
1214         case MSR_CSTAR:
1215         case MSR_LSTAR:
1216                 if (is_noncanonical_address(msr->data, vcpu))
1217                         return 1;
1218                 break;
1219         case MSR_IA32_SYSENTER_EIP:
1220         case MSR_IA32_SYSENTER_ESP:
1221                 /*
1222                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1223                  * non-canonical address is written on Intel but not on
1224                  * AMD (which ignores the top 32-bits, because it does
1225                  * not implement 64-bit SYSENTER).
1226                  *
1227                  * 64-bit code should hence be able to write a non-canonical
1228                  * value on AMD.  Making the address canonical ensures that
1229                  * vmentry does not fail on Intel after writing a non-canonical
1230                  * value, and that something deterministic happens if the guest
1231                  * invokes 64-bit SYSENTER.
1232                  */
1233                 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1234         }
1235         return kvm_x86_ops->set_msr(vcpu, msr);
1236 }
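/*
 * Example of the canonicalization above (illustration only, assuming the
 * sign-extending get_canonical() helper from x86.h): with 48 virtual-address
 * bits, bit 47 is sign-extended, so a non-canonical guest write of
 * 0x0000800000001234 to IA32_SYSENTER_EIP is stored as 0xffff800000001234,
 * which is canonical and cannot make a later VM entry fail on Intel hardware.
 */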
1237 EXPORT_SYMBOL_GPL(kvm_set_msr);
1238 
1239 /*
1240  * Adapt set_msr() to msr_io()'s calling convention
1241  */
1242 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1243 {
1244         struct msr_data msr;
1245         int r;
1246 
1247         msr.index = index;
1248         msr.host_initiated = true;
1249         r = kvm_get_msr(vcpu, &msr);
1250         if (r)
1251                 return r;
1252 
1253         *data = msr.data;
1254         return 0;
1255 }
1256 
1257 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1258 {
1259         struct msr_data msr;
1260 
1261         msr.data = *data;
1262         msr.index = index;
1263         msr.host_initiated = true;
1264         return kvm_set_msr(vcpu, &msr);
1265 }
1266 
1267 #ifdef CONFIG_X86_64
1268 struct pvclock_gtod_data {
1269         seqcount_t      seq;
1270 
1271         struct { /* extract of a clocksource struct */
1272                 int vclock_mode;
1273                 u64     cycle_last;
1274                 u64     mask;
1275                 u32     mult;
1276                 u32     shift;
1277         } clock;
1278 
1279         u64             boot_ns;
1280         u64             nsec_base;
1281         u64             wall_time_sec;
1282 };
1283 
1284 static struct pvclock_gtod_data pvclock_gtod_data;
1285 
1286 static void update_pvclock_gtod(struct timekeeper *tk)
1287 {
1288         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1289         u64 boot_ns;
1290 
1291         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1292 
1293         write_seqcount_begin(&vdata->seq);
1294 
1295         /* copy pvclock gtod data */
1296         vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
1297         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1298         vdata->clock.mask               = tk->tkr_mono.mask;
1299         vdata->clock.mult               = tk->tkr_mono.mult;
1300         vdata->clock.shift              = tk->tkr_mono.shift;
1301 
1302         vdata->boot_ns                  = boot_ns;
1303         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
1304 
1305         vdata->wall_time_sec            = tk->xtime_sec;
1306 
1307         write_seqcount_end(&vdata->seq);
1308 }
1309 #endif
1310 
1311 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1312 {
1313         /*
1314          * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1315          * vcpu_enter_guest.  This function is only called from
1316          * the physical CPU that is running vcpu.
1317          */
1318         kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1319 }
1320 
1321 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1322 {
1323         int version;
1324         int r;
1325         struct pvclock_wall_clock wc;
1326         struct timespec64 boot;
1327 
1328         if (!wall_clock)
1329                 return;
1330 
1331         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1332         if (r)
1333                 return;
1334 
1335         if (version & 1)
1336                 ++version;  /* first time write, random junk */
1337 
1338         ++version;
1339 
1340         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1341                 return;
1342 
1343         /*
1344          * The guest calculates current wall clock time by adding
1345          * system time (updated by kvm_guest_time_update below) to the
1346          * wall clock specified here.  guest system time equals host
1347          * system time for us, thus we must fill in host boot time here.
1348          */
1349         getboottime64(&boot);
1350 
1351         if (kvm->arch.kvmclock_offset) {
1352                 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1353                 boot = timespec64_sub(boot, ts);
1354         }
1355         wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1356         wc.nsec = boot.tv_nsec;
1357         wc.version = version;
1358 
1359         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1360 
1361         version++;
1362         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1363 }
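/*
 * The version field above follows the usual pvclock convention: it is odd
 * while an update is in progress and bumped to an even value once the payload
 * is consistent.  A minimal sketch of the guest-side reader is shown below
 * (illustration only, assuming the struct pvclock_wall_clock layout from
 * asm/pvclock-abi.h; the real reader lives in the guest's pvclock code):
 */
#if 0
static void example_read_wall_clock(volatile struct pvclock_wall_clock *wc,
				    u32 *sec, u32 *nsec)
{
	u32 version;

	do {
		version = wc->version;
		smp_rmb();
		*sec  = wc->sec;
		*nsec = wc->nsec;
		smp_rmb();
	} while ((version & 1) || version != wc->version);
}
#endif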
1364 
1365 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1366 {
1367         do_shl32_div32(dividend, divisor);
1368         return dividend;
1369 }
1370 
1371 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1372                                s8 *pshift, u32 *pmultiplier)
1373 {
1374         uint64_t scaled64;
1375         int32_t  shift = 0;
1376         uint64_t tps64;
1377         uint32_t tps32;
1378 
1379         tps64 = base_hz;
1380         scaled64 = scaled_hz;
1381         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1382                 tps64 >>= 1;
1383                 shift--;
1384         }
1385 
1386         tps32 = (uint32_t)tps64;
1387         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1388                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1389                         scaled64 >>= 1;
1390                 else
1391                         tps32 <<= 1;
1392                 shift++;
1393         }
1394 
1395         *pshift = shift;
1396         *pmultiplier = div_frac(scaled64, tps32);
1397 
1398         pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1399                  __func__, base_hz, scaled_hz, shift, *pmultiplier);
1400 }
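/*
 * Worked example for kvm_get_time_scale() (illustration only): converting
 * nanoseconds (base_hz = NSEC_PER_SEC = 1e9) into cycles of a 2.5 GHz guest
 * TSC (scaled_hz = 2.5e9) yields shift = 2 and mult = 0xa0000000, because
 *
 *   base_hz * (mult / 2^32) * 2^shift = 1e9 * 0.625 * 4 = 2.5e9 = scaled_hz
 *
 * so a delta of 1,000,000 ns scales to (1,000,000 * 0.625) << 2 = 2,500,000
 * guest TSC cycles.
 */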
1401 
1402 #ifdef CONFIG_X86_64
1403 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1404 #endif
1405 
1406 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1407 static unsigned long max_tsc_khz;
1408 
1409 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1410 {
1411         u64 v = (u64)khz * (1000000 + ppm);
1412         do_div(v, 1000000);
1413         return v;
1414 }
1415 
1416 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1417 {
1418         u64 ratio;
1419 
1420         /* Guest TSC same frequency as host TSC? */
1421         if (!scale) {
1422                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1423                 return 0;
1424         }
1425 
1426         /* TSC scaling supported? */
1427         if (!kvm_has_tsc_control) {
1428                 if (user_tsc_khz > tsc_khz) {
1429                         vcpu->arch.tsc_catchup = 1;
1430                         vcpu->arch.tsc_always_catchup = 1;
1431                         return 0;
1432                 } else {
1433                         WARN(1, "user requested TSC rate below hardware speed\n");
1434                         return -1;
1435                 }
1436         }
1437 
1438         /* TSC scaling required  - calculate ratio */
1439         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1440                                 user_tsc_khz, tsc_khz);
1441 
1442         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1443                 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1444                           user_tsc_khz);
1445                 return -1;
1446         }
1447 
1448         vcpu->arch.tsc_scaling_ratio = ratio;
1449         return 0;
1450 }
1451 
1452 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1453 {
1454         u32 thresh_lo, thresh_hi;
1455         int use_scaling = 0;
1456 
1457         /* tsc_khz can be zero if TSC calibration fails */
1458         if (user_tsc_khz == 0) {
1459                 /* set tsc_scaling_ratio to a safe value */
1460                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1461                 return -1;
1462         }
1463 
1464         /* Compute a scale to convert nanoseconds in TSC cycles */
1465         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1466                            &vcpu->arch.virtual_tsc_shift,
1467                            &vcpu->arch.virtual_tsc_mult);
1468         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1469 
1470         /*
1471          * Compute the variation in TSC rate which is acceptable
1472          * within the tolerance and decide whether the requested
1473          * rate lies within those bounds of the hardware rate.
1474          * If so, no scaling or compensation need be done.
1475          */
1476         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1477         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1478         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1479                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1480                 use_scaling = 1;
1481         }
1482         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1483 }
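/*
 * Example of the tolerance check above (illustration only): with a host
 * tsc_khz of 2,000,000 and the default tsc_tolerance_ppm of 250,
 *
 *   thresh_lo = 2,000,000 * (1,000,000 - 250) / 1,000,000 = 1,999,500 kHz
 *   thresh_hi = 2,000,000 * (1,000,000 + 250) / 1,000,000 = 2,000,500 kHz
 *
 * so any requested rate outside [1,999,500, 2,000,500] kHz falls back to
 * hardware TSC scaling or catchup via set_tsc_khz().
 */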
1484 
1485 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1486 {
1487         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1488                                       vcpu->arch.virtual_tsc_mult,
1489                                       vcpu->arch.virtual_tsc_shift);
1490         tsc += vcpu->arch.this_tsc_write;
1491         return tsc;
1492 }
1493 
1494 static inline int gtod_is_based_on_tsc(int mode)
1495 {
1496         return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1497 }
1498 
1499 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1500 {
1501 #ifdef CONFIG_X86_64
1502         bool vcpus_matched;
1503         struct kvm_arch *ka = &vcpu->kvm->arch;
1504         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1505 
1506         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1507                          atomic_read(&vcpu->kvm->online_vcpus));
1508 
1509         /*
1510          * Once the masterclock is enabled, always make the request in
1511          * order to update it.
1512          *
1513          * In order to enable the masterclock, the host clocksource must be
1514          * TSC and the vcpus need to have matched TSCs.  When that happens,
1515          * make the request to enable the masterclock.
1516          */
1517         if (ka->use_master_clock ||
1518             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1519                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1520 
1521         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1522                             atomic_read(&vcpu->kvm->online_vcpus),
1523                             ka->use_master_clock, gtod->clock.vclock_mode);
1524 #endif
1525 }
1526 
1527 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1528 {
1529         u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1530         vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1531 }
1532 
1533 /*
1534  * Multiply tsc by a fixed point number represented by ratio.
1535  *
1536  * The most significant 64-N bits (mult) of ratio represent the
1537  * integral part of the fixed point number; the remaining N bits
1538  * (frac) represent the fractional part, ie. ratio represents a fixed
1539  * point number (mult + frac * 2^(-N)).
1540  *
1541  * N equals to kvm_tsc_scaling_ratio_frac_bits.
1542  */
1543 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1544 {
1545         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1546 }
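/*
 * Worked example for the fixed-point format above (illustrative only):
 * with kvm_tsc_scaling_ratio_frac_bits == 48, as on VMX, a guest meant to
 * run at half the host TSC rate uses ratio == 0.5 * 2^48 == 1ULL << 47,
 * and
 *
 *   __scale_tsc(1ULL << 47, host_tsc)
 *     == (host_tsc * (1ULL << 47)) >> 48
 *     == host_tsc / 2
 *
 * In general ratio == (guest_khz << frac_bits) / host_khz, which is how
 * set_tsc_khz() derives it from user_tsc_khz and the host's tsc_khz.
 */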
1547 
1548 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1549 {
1550         u64 _tsc = tsc;
1551         u64 ratio = vcpu->arch.tsc_scaling_ratio;
1552 
1553         if (ratio != kvm_default_tsc_scaling_ratio)
1554                 _tsc = __scale_tsc(ratio, tsc);
1555 
1556         return _tsc;
1557 }
1558 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1559 
1560 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1561 {
1562         u64 tsc;
1563 
1564         tsc = kvm_scale_tsc(vcpu, rdtsc());
1565 
1566         return target_tsc - tsc;
1567 }
1568 
1569 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1570 {
1571         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1572 
1573         return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1574 }
1575 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1576 
1577 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1578 {
1579         kvm_x86_ops->write_tsc_offset(vcpu, offset);
1580         vcpu->arch.tsc_offset = offset;
1581 }
1582 
1583 static inline bool kvm_check_tsc_unstable(void)
1584 {
1585 #ifdef CONFIG_X86_64
1586         /*
1587          * TSC is marked unstable when we're running on Hyper-V,
1588          * but the 'TSC page' clocksource is still usable there.
1589          */
1590         if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1591                 return false;
1592 #endif
1593         return check_tsc_unstable();
1594 }
1595 
1596 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1597 {
1598         struct kvm *kvm = vcpu->kvm;
1599         u64 offset, ns, elapsed;
1600         unsigned long flags;
1601         bool matched;
1602         bool already_matched;
1603         u64 data = msr->data;
1604         bool synchronizing = false;
1605 
1606         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1607         offset = kvm_compute_tsc_offset(vcpu, data);
1608         ns = ktime_get_boot_ns();
1609         elapsed = ns - kvm->arch.last_tsc_nsec;
1610 
1611         if (vcpu->arch.virtual_tsc_khz) {
1612                 if (data == 0 && msr->host_initiated) {
1613                         /*
1614                          * detection of vcpu initialization -- need to sync
1615                          * with other vCPUs. This particularly helps to keep
1616                          * kvm_clock stable after CPU hotplug
1617                          */
1618                         synchronizing = true;
1619                 } else {
1620                         u64 tsc_exp = kvm->arch.last_tsc_write +
1621                                                 nsec_to_cycles(vcpu, elapsed);
1622                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1623                         /*
1624                          * Special case: TSC write with a small delta (1 second)
1625                          * of virtual cycle time against real time is
1626                          * interpreted as an attempt to synchronize the CPU.
1627                          */
1628                         synchronizing = data < tsc_exp + tsc_hz &&
1629                                         data + tsc_hz > tsc_exp;
1630                 }
1631         }
1632 
1633         /*
1634          * For a reliable TSC, we can match TSC offsets, and for an unstable
1635          * TSC, we add elapsed time in this computation.  We could let the
1636          * compensation code attempt to catch up if we fall behind, but
1637          * it's better to try to match offsets from the beginning.
1638          */
1639         if (synchronizing &&
1640             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1641                 if (!kvm_check_tsc_unstable()) {
1642                         offset = kvm->arch.cur_tsc_offset;
1643                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1644                 } else {
1645                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1646                         data += delta;
1647                         offset = kvm_compute_tsc_offset(vcpu, data);
1648                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1649                 }
1650                 matched = true;
1651                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1652         } else {
1653                 /*
1654                  * We split periods of matched TSC writes into generations.
1655                  * For each generation, we track the original measured
1656                  * nanosecond time, offset, and write, so if TSCs are in
1657                  * sync, we can match exact offset, and if not, we can match
1658                  * exact software computation in compute_guest_tsc()
1659                  *
1660                  * These values are tracked in kvm->arch.cur_xxx variables.
1661                  */
1662                 kvm->arch.cur_tsc_generation++;
1663                 kvm->arch.cur_tsc_nsec = ns;
1664                 kvm->arch.cur_tsc_write = data;
1665                 kvm->arch.cur_tsc_offset = offset;
1666                 matched = false;
1667                 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1668                          kvm->arch.cur_tsc_generation, data);
1669         }
1670 
1671         /*
1672          * We also track the most recent recorded KHz, write and time to
1673          * allow the matching interval to be extended at each write.
1674          */
1675         kvm->arch.last_tsc_nsec = ns;
1676         kvm->arch.last_tsc_write = data;
1677         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1678 
1679         vcpu->arch.last_guest_tsc = data;
1680 
1681         /* Keep track of which generation this VCPU has synchronized to */
1682         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1683         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1684         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1685 
1686         if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1687                 update_ia32_tsc_adjust_msr(vcpu, offset);
1688 
1689         kvm_vcpu_write_tsc_offset(vcpu, offset);
1690         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1691 
1692         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1693         if (!matched) {
1694                 kvm->arch.nr_vcpus_matched_tsc = 0;
1695         } else if (!already_matched) {
1696                 kvm->arch.nr_vcpus_matched_tsc++;
1697         }
1698 
1699         kvm_track_tsc_matching(vcpu);
1700         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1701 }
1702 
1703 EXPORT_SYMBOL_GPL(kvm_write_tsc);
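/*
 * Illustrative numbers for the synchronization heuristic above: with
 * virtual_tsc_khz == 2,000,000 (a 2 GHz guest), tsc_hz is 2,000,000,000
 * cycles, i.e. one second of virtual cycle time.  A guest TSC write is
 * treated as "synchronizing" only if it lands within one second's worth
 * of cycles of tsc_exp, the value the TSC should have reached by now;
 * anything farther away starts a new TSC generation instead.
 */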
1704 
1705 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1706                                            s64 adjustment)
1707 {
1708         kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
1709 }
1710 
1711 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1712 {
1713         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1714                 WARN_ON(adjustment < 0);
1715         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1716         adjust_tsc_offset_guest(vcpu, adjustment);
1717 }
1718 
1719 #ifdef CONFIG_X86_64
1720 
1721 static u64 read_tsc(void)
1722 {
1723         u64 ret = (u64)rdtsc_ordered();
1724         u64 last = pvclock_gtod_data.clock.cycle_last;
1725 
1726         if (likely(ret >= last))
1727                 return ret;
1728 
1729         /*
1730          * GCC likes to generate cmov here, but this branch is extremely
1731          * predictable (it's just a function of time and the likely is
1732          * very likely) and there's a data dependence, so force GCC
1733          * to generate a branch instead.  I don't barrier() because
1734          * we don't actually need a barrier, and if this function
1735          * ever gets inlined it will generate worse code.
1736          */
1737         asm volatile ("");
1738         return last;
1739 }
1740 
1741 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1742 {
1743         long v;
1744         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1745         u64 tsc_pg_val;
1746 
1747         switch (gtod->clock.vclock_mode) {
1748         case VCLOCK_HVCLOCK:
1749                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1750                                                   tsc_timestamp);
1751                 if (tsc_pg_val != U64_MAX) {
1752                         /* TSC page valid */
1753                         *mode = VCLOCK_HVCLOCK;
1754                         v = (tsc_pg_val - gtod->clock.cycle_last) &
1755                                 gtod->clock.mask;
1756                 } else {
1757                         /* TSC page invalid */
1758                         *mode = VCLOCK_NONE;
1759                 }
1760                 break;
1761         case VCLOCK_TSC:
1762                 *mode = VCLOCK_TSC;
1763                 *tsc_timestamp = read_tsc();
1764                 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1765                         gtod->clock.mask;
1766                 break;
1767         default:
1768                 *mode = VCLOCK_NONE;
1769         }
1770 
1771         if (*mode == VCLOCK_NONE)
1772                 *tsc_timestamp = v = 0;
1773 
1774         return v * gtod->clock.mult;
1775 }
1776 
1777 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1778 {
1779         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1780         unsigned long seq;
1781         int mode;
1782         u64 ns;
1783 
1784         do {
1785                 seq = read_seqcount_begin(&gtod->seq);
1786                 ns = gtod->nsec_base;
1787                 ns += vgettsc(tsc_timestamp, &mode);
1788                 ns >>= gtod->clock.shift;
1789                 ns += gtod->boot_ns;
1790         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1791         *t = ns;
1792 
1793         return mode;
1794 }
1795 
1796 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
1797 {
1798         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1799         unsigned long seq;
1800         int mode;
1801         u64 ns;
1802 
1803         do {
1804                 seq = read_seqcount_begin(&gtod->seq);
1805                 ts->tv_sec = gtod->wall_time_sec;
1806                 ns = gtod->nsec_base;
1807                 ns += vgettsc(tsc_timestamp, &mode);
1808                 ns >>= gtod->clock.shift;
1809         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1810 
1811         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1812         ts->tv_nsec = ns;
1813 
1814         return mode;
1815 }
1816 
1817 /* returns true if host is using TSC based clocksource */
1818 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1819 {
1820         /* checked again under seqlock below */
1821         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1822                 return false;
1823 
1824         return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1825                                                       tsc_timestamp));
1826 }
1827 
1828 /* returns true if host is using TSC based clocksource */
1829 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
1830                                            u64 *tsc_timestamp)
1831 {
1832         /* checked again under seqlock below */
1833         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1834                 return false;
1835 
1836         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1837 }
1838 #endif
1839 
1840 /*
1841  *
1842  * Assuming a stable TSC across physical CPUs, and a stable TSC
1843  * across virtual CPUs, the following condition is possible.
1844  * Each numbered line represents an event visible to both
1845  * CPUs at the next numbered event.
1846  *
1847  * "timespecX" represents host monotonic time. "tscX" represents
1848  * RDTSC value.
1849  *
1850  *              VCPU0 on CPU0           |       VCPU1 on CPU1
1851  *
1852  * 1.  read timespec0,tsc0
1853  * 2.                                   | timespec1 = timespec0 + N
1854  *                                      | tsc1 = tsc0 + M
1855  * 3. transition to guest               | transition to guest
1856  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1857  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1858  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1859  *
1860  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1861  *
1862  *      - ret0 < ret1
1863  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1864  *              ...
1865  *      - 0 < N - M => M < N
1866  *
1867  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1868  * always the case (the difference between two distinct xtime instances
1869  * might be smaller than the difference between corresponding TSC reads,
1870  * when updating guest vcpus pvclock areas).
1871  *
1872  * To avoid that problem, do not allow visibility of distinct
1873  * system_timestamp/tsc_timestamp values simultaneously: use a master
1874  * copy of host monotonic time values. Update that master copy
1875  * in lockstep.
1876  *
1877  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1878  *
1879  */
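/*
 * Concrete numbers for the scenario above (illustrative only, treating
 * TSC cycles as nanoseconds just as the sketch does): let
 * timespec0 = 1000, tsc0 = 0, and suppose that between events 1 and 2
 * the wall clock advances by N = 100 while the TSC samples end up
 * M = 250 apart.  At a later rdtsc value of 300:
 *
 *   ret0 = 1000 + (300 - 0)         = 1300
 *   ret1 = 1000 + 100 + (300 - 250) = 1150
 *
 * ret1 < ret0 although it is read later, i.e. guest time appears to go
 * backwards -- exactly the M >= N case the master copy protects against.
 */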
1880 
1881 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1882 {
1883 #ifdef CONFIG_X86_64
1884         struct kvm_arch *ka = &kvm->arch;
1885         int vclock_mode;
1886         bool host_tsc_clocksource, vcpus_matched;
1887 
1888         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1889                         atomic_read(&kvm->online_vcpus));
1890 
1891         /*
1892          * If the host uses TSC clock, then passthrough TSC as stable
1893          * to the guest.
1894          */
1895         host_tsc_clocksource = kvm_get_time_and_clockread(
1896                                         &ka->master_kernel_ns,
1897                                         &ka->master_cycle_now);
1898 
1899         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1900                                 && !ka->backwards_tsc_observed
1901                                 && !ka->boot_vcpu_runs_old_kvmclock;
1902 
1903         if (ka->use_master_clock)
1904                 atomic_set(&kvm_guest_has_master_clock, 1);
1905 
1906         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1907         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1908                                         vcpus_matched);
1909 #endif
1910 }
1911 
1912 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
1913 {
1914         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
1915 }
1916 
1917 static void kvm_gen_update_masterclock(struct kvm *kvm)
1918 {
1919 #ifdef CONFIG_X86_64
1920         int i;
1921         struct kvm_vcpu *vcpu;
1922         struct kvm_arch *ka = &kvm->arch;
1923 
1924         spin_lock(&ka->pvclock_gtod_sync_lock);
1925         kvm_make_mclock_inprogress_request(kvm);
1926         /* no guest entries from this point */
1927         pvclock_update_vm_gtod_copy(kvm);
1928 
1929         kvm_for_each_vcpu(i, vcpu, kvm)
1930                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1931 
1932         /* guest entries allowed */
1933         kvm_for_each_vcpu(i, vcpu, kvm)
1934                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
1935 
1936         spin_unlock(&ka->pvclock_gtod_sync_lock);
1937 #endif
1938 }
1939 
1940 u64 get_kvmclock_ns(struct kvm *kvm)
1941 {
1942         struct kvm_arch *ka = &kvm->arch;
1943         struct pvclock_vcpu_time_info hv_clock;
1944         u64 ret;
1945 
1946         spin_lock(&ka->pvclock_gtod_sync_lock);
1947         if (!ka->use_master_clock) {
1948                 spin_unlock(&ka->pvclock_gtod_sync_lock);
1949                 return ktime_get_boot_ns() + ka->kvmclock_offset;
1950         }
1951 
1952         hv_clock.tsc_timestamp = ka->master_cycle_now;
1953         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
1954         spin_unlock(&ka->pvclock_gtod_sync_lock);
1955 
1956         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
1957         get_cpu();
1958 
1959         if (__this_cpu_read(cpu_tsc_khz)) {
1960                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
1961                                    &hv_clock.tsc_shift,
1962                                    &hv_clock.tsc_to_system_mul);
1963                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
1964         } else
1965                 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
1966 
1967         put_cpu();
1968 
1969         return ret;
1970 }
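/*
 * For reference, the conversion performed by __pvclock_read_cycles() above
 * boils down to (sketch, overflow handling elided):
 *
 *   delta = rdtsc() - hv_clock.tsc_timestamp;
 *   delta = (tsc_shift < 0) ? delta >> -tsc_shift : delta << tsc_shift;
 *   ns    = hv_clock.system_time + ((delta * tsc_to_system_mul) >> 32);
 *
 * i.e. kvmclock is the master copy's kernel_ns (plus kvmclock_offset)
 * advanced by the TSC cycles elapsed since master_cycle_now, scaled to ns
 * with the mult/shift pair computed by kvm_get_time_scale().
 */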
1971 
1972 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1973 {
1974         struct kvm_vcpu_arch *vcpu = &v->arch;
1975         struct pvclock_vcpu_time_info guest_hv_clock;
1976 
1977         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1978                 &guest_hv_clock, sizeof(guest_hv_clock))))
1979                 return;
1980 
1981         /* This VCPU is paused, but it's legal for a guest to read another
1982          * VCPU's kvmclock, so we really have to follow the specification where
1983          * it says that version is odd if data is being modified, and even after
1984          * it is consistent.
1985          *
1986          * Version field updates must be kept separate.  This is because
1987          * kvm_write_guest_cached might use a "rep movs" instruction, and
1988          * writes within a string instruction are weakly ordered.  So there
1989          * are three writes overall.
1990          *
1991          * As a small optimization, only write the version field in the first
1992          * and third write.  The vcpu->pv_time cache is still valid, because the
1993          * version field is the first in the struct.
1994          */
1995         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
1996 
1997         if (guest_hv_clock.version & 1)
1998                 ++guest_hv_clock.version;  /* first time write, random junk */
1999 
2000         vcpu->hv_clock.version = guest_hv_clock.version + 1;
2001         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2002                                 &vcpu->hv_clock,
2003                                 sizeof(vcpu->hv_clock.version));
2004 
2005         smp_wmb();
2006 
2007         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2008         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2009 
2010         if (vcpu->pvclock_set_guest_stopped_request) {
2011                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2012                 vcpu->pvclock_set_guest_stopped_request = false;
2013         }
2014 
2015         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2016 
2017         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2018                                 &vcpu->hv_clock,
2019                                 sizeof(vcpu->hv_clock));
2020 
2021         smp_wmb();
2022 
2023         vcpu->hv_clock.version++;
2024         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2025                                 &vcpu->hv_clock,
2026                                 sizeof(vcpu->hv_clock.version));
2027 }
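/*
 * For context, a guest reading this structure is expected to follow the
 * seqcount-like protocol implied by the version field: retry while the
 * version is odd or changes across the read.  The helper below is only an
 * illustrative sketch (the name is made up and it is not built here); the
 * real readers live in the guest's pvclock code.
 */
#if 0	/* illustrative sketch only */
static u64 example_guest_read_pvclock(struct pvclock_vcpu_time_info *src)
{
	u32 version;
	u64 delta, ns;

	do {
		version = READ_ONCE(src->version);
		smp_rmb();
		delta = rdtsc_ordered() - src->tsc_timestamp;
		if (src->tsc_shift < 0)
			delta >>= -src->tsc_shift;
		else
			delta <<= src->tsc_shift;
		ns = src->system_time +
		     mul_u64_u32_shr(delta, src->tsc_to_system_mul, 32);
		smp_rmb();
	} while ((version & 1) || version != READ_ONCE(src->version));

	return ns;
}
#endif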
2028 
2029 static int kvm_guest_time_update(struct kvm_vcpu *v)
2030 {
2031         unsigned long flags, tgt_tsc_khz;
2032         struct kvm_vcpu_arch *vcpu = &v->arch;
2033         struct kvm_arch *ka = &v->kvm->arch;
2034         s64 kernel_ns;
2035         u64 tsc_timestamp, host_tsc;
2036         u8 pvclock_flags;
2037         bool use_master_clock;
2038 
2039         kernel_ns = 0;
2040         host_tsc = 0;
2041 
2042         /*
2043          * If the host uses TSC clock, then passthrough TSC as stable
2044          * to the guest.
2045          */
2046         spin_lock(&ka->pvclock_gtod_sync_lock);
2047         use_master_clock = ka->use_master_clock;
2048         if (use_master_clock) {
2049                 host_tsc = ka->master_cycle_now;
2050                 kernel_ns = ka->master_kernel_ns;
2051         }
2052         spin_unlock(&ka->pvclock_gtod_sync_lock);
2053 
2054         /* Keep irq disabled to prevent changes to the clock */
2055         local_irq_save(flags);
2056         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2057         if (unlikely(tgt_tsc_khz == 0)) {
2058                 local_irq_restore(flags);
2059                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2060                 return 1;
2061         }
2062         if (!use_master_clock) {
2063                 host_tsc = rdtsc();
2064                 kernel_ns = ktime_get_boot_ns();
2065         }
2066 
2067         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2068 
2069         /*
2070          * We may have to catch up the TSC to match elapsed wall clock
2071          * time for two reasons, even if kvmclock is used.
2072          *   1) CPU could have been running below the maximum TSC rate
2073          *   2) Broken TSC compensation resets the base at each VCPU
2074          *      entry to avoid unknown leaps of TSC even when running
2075          *      again on the same CPU.  This may cause apparent elapsed
2076          *      time to disappear, and the guest to stand still or run
2077          *      very slowly.
2078          */
2079         if (vcpu->tsc_catchup) {
2080                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2081                 if (tsc > tsc_timestamp) {
2082                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2083                         tsc_timestamp = tsc;
2084                 }
2085         }
2086 
2087         local_irq_restore(flags);
2088 
2089         /* With all the info we got, fill in the values */
2090 
2091         if (kvm_has_tsc_control)
2092                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2093 
2094         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2095                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2096                                    &vcpu->hv_clock.tsc_shift,
2097                                    &vcpu->hv_clock.tsc_to_system_mul);
2098                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2099         }
2100 
2101         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2102         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2103         vcpu->last_guest_tsc = tsc_timestamp;
2104 
2105         /* If the host uses TSC clocksource, then it is stable */
2106         pvclock_flags = 0;
2107         if (use_master_clock)
2108                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2109 
2110         vcpu->hv_clock.flags = pvclock_flags;
2111 
2112         if (vcpu->pv_time_enabled)
2113                 kvm_setup_pvclock_page(v);
2114         if (v == kvm_get_vcpu(v->kvm, 0))
2115                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2116         return 0;
2117 }
2118 
2119 /*
2120  * kvmclock updates which are isolated to a given vcpu, such as
2121  * vcpu->cpu migration, should not allow system_timestamp from
2122  * the rest of the vcpus to remain static. Otherwise ntp frequency
2123  * correction applies to one vcpu's system_timestamp but not
2124  * the others.
2125  *
2126  * So in those cases, request a kvmclock update for all vcpus.
2127  * We need to rate-limit these requests though, as they can
2128  * considerably slow guests that have a large number of vcpus.
2129  * The time for a remote vcpu to update its kvmclock is bound
2130  * by the delay we use to rate-limit the updates.
2131  */
2132 
2133 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2134 
2135 static void kvmclock_update_fn(struct work_struct *work)
2136 {
2137         int i;
2138         struct delayed_work *dwork = to_delayed_work(work);
2139         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2140                                            kvmclock_update_work);
2141         struct kvm *kvm = container_of(ka, struct kvm, arch);
2142         struct kvm_vcpu *vcpu;
2143 
2144         kvm_for_each_vcpu(i, vcpu, kvm) {
2145                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2146                 kvm_vcpu_kick(vcpu);
2147         }
2148 }
2149 
2150 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2151 {
2152         struct kvm *kvm = v->kvm;
2153 
2154         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2155         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2156                                         KVMCLOCK_UPDATE_DELAY);
2157 }
2158 
2159 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2160 
2161 static void kvmclock_sync_fn(struct work_struct *work)
2162 {
2163         struct delayed_work *dwork = to_delayed_work(work);
2164         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2165                                            kvmclock_sync_work);
2166         struct kvm *kvm = container_of(ka, struct kvm, arch);
2167 
2168         if (!kvmclock_periodic_sync)
2169                 return;
2170 
2171         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2172         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2173                                         KVMCLOCK_SYNC_PERIOD);
2174 }
2175 
2176 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2177 {
2178         u64 mcg_cap = vcpu->arch.mcg_cap;
2179         unsigned bank_num = mcg_cap & 0xff;
2180         u32 msr = msr_info->index;
2181         u64 data = msr_info->data;
2182 
2183         switch (msr) {
2184         case MSR_IA32_MCG_STATUS:
2185                 vcpu->arch.mcg_status = data;
2186                 break;
2187         case MSR_IA32_MCG_CTL:
2188                 if (!(mcg_cap & MCG_CTL_P) &&
2189                     (data || !msr_info->host_initiated))
2190                         return 1;
2191                 if (data != 0 && data != ~(u64)0)
2192                         return 1;
2193                 vcpu->arch.mcg_ctl = data;
2194                 break;
2195         default:
2196                 if (msr >= MSR_IA32_MC0_CTL &&
2197                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2198                         u32 offset = msr - MSR_IA32_MC0_CTL;
2199                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2200                          * Some Linux kernels, though, clear bit 10 in bank 4 to
2201                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2202                          * this to avoid an uncaught #GP in the guest.
2203                          */
2204                         if ((offset & 0x3) == 0 &&
2205                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2206                                 return -1;
2207                         if (!msr_info->host_initiated &&
2208                                 (offset & 0x3) == 1 && data != 0)
2209                                 return -1;
2210                         vcpu->arch.mce_banks[offset] = data;
2211                         break;
2212                 }
2213                 return 1;
2214         }
2215         return 0;
2216 }
2217 
2218 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2219 {
2220         struct kvm *kvm = vcpu->kvm;
2221         int lm = is_long_mode(vcpu);
2222         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2223                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2224         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2225                 : kvm->arch.xen_hvm_config.blob_size_32;
2226         u32 page_num = data & ~PAGE_MASK;
2227         u64 page_addr = data & PAGE_MASK;
2228         u8 *page;
2229         int r;
2230 
2231         r = -E2BIG;
2232         if (page_num >= blob_size)
2233                 goto out;
2234         r = -ENOMEM;
2235         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2236         if (IS_ERR(page)) {
2237                 r = PTR_ERR(page);
2238                 goto out;
2239         }
2240         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2241                 goto out_free;
2242         r = 0;
2243 out_free:
2244         kfree(page);
2245 out:
2246         return r;
2247 }
2248 
2249 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2250 {
2251         gpa_t gpa = data & ~0x3f;
2252 
2253         /* Bits 3:5 are reserved; should be zero */
2254         if (data & 0x38)
2255                 return 1;
2256 
2257         vcpu->arch.apf.msr_val = data;
2258 
2259         if (!(data & KVM_ASYNC_PF_ENABLED)) {
2260                 kvm_clear_async_pf_completion_queue(vcpu);
2261                 kvm_async_pf_hash_reset(vcpu);
2262                 return 0;
2263         }
2264 
2265         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2266                                         sizeof(u32)))
2267                 return 1;
2268 
2269         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2270         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2271         kvm_async_pf_wakeup_all(vcpu);
2272         return 0;
2273 }
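/*
 * Illustrative MSR_KVM_ASYNC_PF_EN value (not taken from this file): the
 * guest passes a 64-byte aligned GPA of its 32-bit "reason" word in bits
 * 63:6 and flags in the low bits, e.g.
 *
 *   data = gpa_of_apf_reason              (64-byte aligned)
 *        | KVM_ASYNC_PF_ENABLED           (bit 0)
 *        | KVM_ASYNC_PF_SEND_ALWAYS;      (bit 1: deliver in kernel mode too)
 *
 * Bits 3:5 must stay clear, otherwise the write is rejected above.
 */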
2274 
2275 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2276 {
2277         vcpu->arch.pv_time_enabled = false;
2278 }
2279 
2280 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2281 {
2282         ++vcpu->stat.tlb_flush;
2283         kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2284 }
2285 
2286 static void record_steal_time(struct kvm_vcpu *vcpu)
2287 {
2288         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2289                 return;
2290 
2291         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2292                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2293                 return;
2294 
2295         /*
2296          * Doing a TLB flush here, on the guest's behalf, can avoid
2297          * expensive IPIs.
2298          */
2299         if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2300                 kvm_vcpu_flush_tlb(vcpu, false);
2301 
2302         if (vcpu->arch.st.steal.version & 1)
2303                 vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
2304 
2305         vcpu->arch.st.steal.version += 1;
2306 
2307         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2308                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2309 
2310         smp_wmb();
2311 
2312         vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2313                 vcpu->arch.st.last_steal;
2314         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2315 
2316         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2317                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2318 
2319         smp_wmb();
2320 
2321         vcpu->arch.st.steal.version += 1;
2322 
2323         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2324                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2325 }
2326 
2327 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2328 {
2329         bool pr = false;
2330         u32 msr = msr_info->index;
2331         u64 data = msr_info->data;
2332 
2333         switch (msr) {
2334         case MSR_AMD64_NB_CFG:
2335         case MSR_IA32_UCODE_WRITE:
2336         case MSR_VM_HSAVE_PA:
2337         case MSR_AMD64_PATCH_LOADER:
2338         case MSR_AMD64_BU_CFG2:
2339         case MSR_AMD64_DC_CFG:
2340                 break;
2341 
2342         case MSR_IA32_UCODE_REV:
2343                 if (msr_info->host_initiated)
2344                         vcpu->arch.microcode_version = data;
2345                 break;
2346         case MSR_EFER:
2347                 return set_efer(vcpu, data);
2348         case MSR_K7_HWCR:
2349                 data &= ~(u64)0x40;     /* ignore flush filter disable */
2350                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2351                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
2352                 data &= ~(u64)0x40000;  /* ignore MC status write enable */
2353                 if (data != 0) {
2354                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2355                                     data);
2356                         return 1;
2357                 }
2358                 break;
2359         case MSR_FAM10H_MMIO_CONF_BASE:
2360                 if (data != 0) {
2361                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2362                                     "0x%llx\n", data);
2363                         return 1;
2364                 }
2365                 break;
2366         case MSR_IA32_DEBUGCTLMSR:
2367                 if (!data) {
2368                         /* We support the non-activated case already */
2369                         break;
2370                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2371                         /* Values other than LBR and BTF are vendor-specific,
2372                            thus reserved and should throw a #GP */
2373                         return 1;
2374                 }
2375                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2376                             __func__, data);
2377                 break;
2378         case 0x200 ... 0x2ff:
2379                 return kvm_mtrr_set_msr(vcpu, msr, data);
2380         case MSR_IA32_APICBASE:
2381                 return kvm_set_apic_base(vcpu, msr_info);
2382         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2383                 return kvm_x2apic_msr_write(vcpu, msr, data);
2384         case MSR_IA32_TSCDEADLINE:
2385                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2386                 break;
2387         case MSR_IA32_TSC_ADJUST:
2388                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2389                         if (!msr_info->host_initiated) {
2390                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2391                                 adjust_tsc_offset_guest(vcpu, adj);
2392                         }
2393                         vcpu->arch.ia32_tsc_adjust_msr = data;
2394                 }
2395                 break;
2396         case MSR_IA32_MISC_ENABLE:
2397                 vcpu->arch.ia32_misc_enable_msr = data;
2398                 break;
2399         case MSR_IA32_SMBASE:
2400                 if (!msr_info->host_initiated)
2401                         return 1;
2402                 vcpu->arch.smbase = data;
2403                 break;
2404         case MSR_IA32_TSC:
2405                 kvm_write_tsc(vcpu, msr_info);
2406                 break;
2407         case MSR_SMI_COUNT:
2408                 if (!msr_info->host_initiated)
2409                         return 1;
2410                 vcpu->arch.smi_count = data;
2411                 break;
2412         case MSR_KVM_WALL_CLOCK_NEW:
2413         case MSR_KVM_WALL_CLOCK:
2414                 vcpu->kvm->arch.wall_clock = data;
2415                 kvm_write_wall_clock(vcpu->kvm, data);
2416                 break;
2417         case MSR_KVM_SYSTEM_TIME_NEW:
2418         case MSR_KVM_SYSTEM_TIME: {
2419                 struct kvm_arch *ka = &vcpu->kvm->arch;
2420 
2421                 kvmclock_reset(vcpu);
2422 
2423                 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2424                         bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2425 
2426                         if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2427                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2428 
2429                         ka->boot_vcpu_runs_old_kvmclock = tmp;
2430                 }
2431 
2432                 vcpu->arch.time = data;
2433                 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2434 
2435                 /* we verify if the enable bit is set... */
2436                 if (!(data & 1))
2437                         break;
2438 
2439                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2440                      &vcpu->arch.pv_time, data & ~1ULL,
2441                      sizeof(struct pvclock_vcpu_time_info)))
2442                         vcpu->arch.pv_time_enabled = false;
2443                 else
2444                         vcpu->arch.pv_time_enabled = true;
2445 
2446                 break;
2447         }
2448         case MSR_KVM_ASYNC_PF_EN:
2449                 if (kvm_pv_enable_async_pf(vcpu, data))
2450                         return 1;
2451                 break;
2452         case MSR_KVM_STEAL_TIME:
2453 
2454                 if (unlikely(!sched_info_on()))
2455                         return 1;
2456 
2457                 if (data & KVM_STEAL_RESERVED_MASK)
2458                         return 1;
2459 
2460                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2461                                                 data & KVM_STEAL_VALID_BITS,
2462                                                 sizeof(struct kvm_steal_time)))
2463                         return 1;
2464 
2465                 vcpu->arch.st.msr_val = data;
2466 
2467                 if (!(data & KVM_MSR_ENABLED))
2468                         break;
2469 
2470                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2471 
2472                 break;
2473         case MSR_KVM_PV_EOI_EN:
2474                 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2475                         return 1;
2476                 break;
2477 
2478         case MSR_IA32_MCG_CTL:
2479         case MSR_IA32_MCG_STATUS:
2480         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2481                 return set_msr_mce(vcpu, msr_info);
2482 
2483         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2484         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2485                 pr = true; /* fall through */
2486         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2487         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2488                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2489                         return kvm_pmu_set_msr(vcpu, msr_info);
2490 
2491                 if (pr || data != 0)
2492                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2493                                     "0x%x data 0x%llx\n", msr, data);
2494                 break;
2495         case MSR_K7_CLK_CTL:
2496                 /*
2497                  * Ignore all writes to this no longer documented MSR.
2498                  * Writes are only relevant for old K7 processors,
2499                  * all pre-dating SVM, but a recommended workaround from
2500                  * AMD for these chips. It is possible to specify the
2501                  * affected processor models on the command line, hence
2502                  * the need to ignore the workaround.
2503                  */
2504                 break;
2505         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2506         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2507         case HV_X64_MSR_CRASH_CTL:
2508         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2509         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2510         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2511         case HV_X64_MSR_TSC_EMULATION_STATUS:
2512                 return kvm_hv_set_msr_common(vcpu, msr, data,
2513                                              msr_info->host_initiated);
2514         case MSR_IA32_BBL_CR_CTL3:
2515                 /* Drop writes to this legacy MSR -- see rdmsr
2516                  * counterpart for further detail.
2517                  */
2518                 if (report_ignored_msrs)
2519                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2520                                 msr, data);
2521                 break;
2522         case MSR_AMD64_OSVW_ID_LENGTH:
2523                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2524                         return 1;
2525                 vcpu->arch.osvw.length = data;
2526                 break;
2527         case MSR_AMD64_OSVW_STATUS:
2528                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2529                         return 1;
2530                 vcpu->arch.osvw.status = data;
2531                 break;
2532         case MSR_PLATFORM_INFO:
2533                 if (!msr_info->host_initiated ||
2534                     data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
2535                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2536                      cpuid_fault_enabled(vcpu)))
2537                         return 1;
2538                 vcpu->arch.msr_platform_info = data;
2539                 break;
2540         case MSR_MISC_FEATURES_ENABLES:
2541                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2542                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2543                      !supports_cpuid_fault(vcpu)))
2544                         return 1;
2545                 vcpu->arch.msr_misc_features_enables = data;
2546                 break;
2547         default:
2548                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2549                         return xen_hvm_config(vcpu, data);
2550                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2551                         return kvm_pmu_set_msr(vcpu, msr_info);
2552                 if (!ignore_msrs) {
2553                         vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2554                                     msr, data);
2555                         return 1;
2556                 } else {
2557                         if (report_ignored_msrs)
2558                                 vcpu_unimpl(vcpu,
2559                                         "ignored wrmsr: 0x%x data 0x%llx\n",
2560                                         msr, data);
2561                         break;
2562                 }
2563         }
2564         return 0;
2565 }
2566 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2567 
2568 
2569 /*
2570  * Reads an msr value (of 'msr_index') into 'pdata'.
2571  * Returns 0 on success, non-0 otherwise.
2572  * Assumes vcpu_load() was already called.
2573  */
2574 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2575 {
2576         return kvm_x86_ops->get_msr(vcpu, msr);
2577 }
2578 EXPORT_SYMBOL_GPL(kvm_get_msr);
2579 
2580 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
2581 {
2582         u64 data;
2583         u64 mcg_cap = vcpu->arch.mcg_cap;
2584         unsigned bank_num = mcg_cap & 0xff;
2585 
2586         switch (msr) {
2587         case MSR_IA32_P5_MC_ADDR:
2588         case MSR_IA32_P5_MC_TYPE:
2589                 data = 0;
2590                 break;
2591         case MSR_IA32_MCG_CAP:
2592                 data = vcpu->arch.mcg_cap;
2593                 break;
2594         case MSR_IA32_MCG_CTL:
2595                 if (!(mcg_cap & MCG_CTL_P) && !host)
2596                         return 1;
2597                 data = vcpu->arch.mcg_ctl;
2598                 break;
2599         case MSR_IA32_MCG_STATUS:
2600                 data = vcpu->arch.mcg_status;
2601                 break;
2602         default:
2603                 if (msr >= MSR_IA32_MC0_CTL &&
2604                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2605                         u32 offset = msr - MSR_IA32_MC0_CTL;
2606                         data = vcpu->arch.mce_banks[offset];
2607                         break;
2608                 }
2609                 return 1;
2610         }
2611         *pdata = data;
2612         return 0;
2613 }
2614 
2615 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2616 {
2617         switch (msr_info->index) {
2618         case MSR_IA32_PLATFORM_ID:
2619         case MSR_IA32_EBL_CR_POWERON:
2620         case MSR_IA32_DEBUGCTLMSR:
2621         case MSR_IA32_LASTBRANCHFROMIP:
2622         case MSR_IA32_LASTBRANCHTOIP:
2623         case MSR_IA32_LASTINTFROMIP:
2624         case MSR_IA32_LASTINTTOIP:
2625         case MSR_K8_SYSCFG:
2626         case MSR_K8_TSEG_ADDR:
2627         case MSR_K8_TSEG_MASK:
2628         case MSR_K7_HWCR:
2629         case MSR_VM_HSAVE_PA:
2630         case MSR_K8_INT_PENDING_MSG:
2631         case MSR_AMD64_NB_CFG:
2632         case MSR_FAM10H_MMIO_CONF_BASE:
2633         case MSR_AMD64_BU_CFG2:
2634         case MSR_IA32_PERF_CTL:
2635         case MSR_AMD64_DC_CFG:
2636                 msr_info->data = 0;
2637                 break;
2638         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2639         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2640         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2641         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2642         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2643                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2644                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2645                 msr_info->data = 0;
2646                 break;
2647         case MSR_IA32_UCODE_REV:
2648                 msr_info->data = vcpu->arch.microcode_version;
2649                 break;
2650         case MSR_IA32_TSC:
2651                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2652                 break;
2653         case MSR_MTRRcap:
2654         case 0x200 ... 0x2ff:
2655                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2656         case 0xcd: /* fsb frequency */
2657                 msr_info->data = 3;
2658                 break;
2659                 /*
2660                  * MSR_EBC_FREQUENCY_ID
2661                  * Conservative value valid for even the basic CPU models.
2662                  * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2663                  * 100MHz; model 2: 000 in bits 18:16 indicating 100MHz,
2664                  * and 266MHz for models 3 or 4. Set Core Clock
2665                  * Frequency to System Bus Frequency Ratio to 1 (bits
2666                  * 31:24) even though these are only valid for CPU
2667                  * models > 2, however guests may end up dividing or
2668                  * multiplying by zero otherwise.
2669                  */
2670         case MSR_EBC_FREQUENCY_ID:
2671                 msr_info->data = 1 << 24;
2672                 break;
2673         case MSR_IA32_APICBASE:
2674                 msr_info->data = kvm_get_apic_base(vcpu);
2675                 break;
2676         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2677                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2678                 break;
2679         case MSR_IA32_TSCDEADLINE:
2680                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2681                 break;
2682         case MSR_IA32_TSC_ADJUST:
2683                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2684                 break;
2685         case MSR_IA32_MISC_ENABLE:
2686                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2687                 break;
2688         case MSR_IA32_SMBASE:
2689                 if (!msr_info->host_initiated)
2690                         return 1;
2691                 msr_info->data = vcpu->arch.smbase;
2692                 break;
2693         case MSR_SMI_COUNT:
2694                 msr_info->data = vcpu->arch.smi_count;
2695                 break;
2696         case MSR_IA32_PERF_STATUS:
2697                 /* TSC increment by tick */
2698                 msr_info->data = 1000ULL;
2699                 /* CPU multiplier */
2700                 msr_info->data |= (((uint64_t)4ULL) << 40);
2701                 break;
2702         case MSR_EFER:
2703                 msr_info->data = vcpu->arch.efer;
2704                 break;
2705         case MSR_KVM_WALL_CLOCK:
2706         case MSR_KVM_WALL_CLOCK_NEW:
2707                 msr_info->data = vcpu->kvm->arch.wall_clock;
2708                 break;
2709         case MSR_KVM_SYSTEM_TIME:
2710         case MSR_KVM_SYSTEM_TIME_NEW:
2711                 msr_info->data = vcpu->arch.time;
2712                 break;
2713         case MSR_KVM_ASYNC_PF_EN:
2714                 msr_info->data = vcpu->arch.apf.msr_val;
2715                 break;
2716         case MSR_KVM_STEAL_TIME:
2717                 msr_info->data = vcpu->arch.st.msr_val;
2718                 break;
2719         case MSR_KVM_PV_EOI_EN:
2720                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2721                 break;
2722         case MSR_IA32_P5_MC_ADDR:
2723         case MSR_IA32_P5_MC_TYPE:
2724         case MSR_IA32_MCG_CAP:
2725         case MSR_IA32_MCG_CTL:
2726         case MSR_IA32_MCG_STATUS:
2727         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2728                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
2729                                    msr_info->host_initiated);
2730         case MSR_K7_CLK_CTL:
2731                 /*
2732                  * Provide expected ramp-up count for K7. All others
2733                  * are set to zero, indicating minimum divisors for
2734                  * every field.
2735                  *
2736                  * This prevents guest kernels on AMD host with CPU
2737                  * type 6, model 8 and higher from exploding due to
2738                  * the rdmsr failing.
2739                  */
2740                 msr_info->data = 0x20000000;
2741                 break;
2742         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2743         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2744         case HV_X64_MSR_CRASH_CTL:
2745         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2746         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2747         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2748         case HV_X64_MSR_TSC_EMULATION_STATUS:
2749                 return kvm_hv_get_msr_common(vcpu,
2750                                              msr_info->index, &msr_info->data,
2751                                              msr_info->host_initiated);
2752                 break;
2753         case MSR_IA32_BBL_CR_CTL3:
2754                 /* This legacy MSR exists but isn't fully documented in current
2755                  * silicon.  It is however accessed by winxp in very narrow
2756                  * scenarios where it sets bit #19, itself documented as
2757                  * a "reserved" bit.  Best effort attempt to source coherent
2758                  * read data here should the balance of the register be
2759                  * interpreted by the guest:
2760                  *
2761                  * L2 cache control register 3: 64GB range, 256KB size,
2762                  * enabled, latency 0x1, configured
2763                  */
2764                 msr_info->data = 0xbe702111;
2765                 break;
2766         case MSR_AMD64_OSVW_ID_LENGTH:
2767                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2768                         return 1;
2769                 msr_info->data = vcpu->arch.osvw.length;
2770                 break;
2771         case MSR_AMD64_OSVW_STATUS:
2772                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2773                         return 1;
2774                 msr_info->data = vcpu->arch.osvw.status;
2775                 break;
2776         case MSR_PLATFORM_INFO:
2777                 msr_info->data = vcpu->arch.msr_platform_info;
2778                 break;
2779         case MSR_MISC_FEATURES_ENABLES:
2780                 msr_info->data = vcpu->arch.msr_misc_features_enables;
2781                 break;
2782         default:
2783                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2784                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2785                 if (!ignore_msrs) {
2786                         vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2787                                                msr_info->index);
2788                         return 1;
2789                 } else {
2790                         if (report_ignored_msrs)
2791                                 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2792                                         msr_info->index);
2793                         msr_info->data = 0;
2794                 }
2795                 break;
2796         }
2797         return 0;
2798 }
2799 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2800 
2801 /*
2802  * Read or write a bunch of msrs. All parameters are kernel addresses.
2803  *
2804  * @return number of msrs set successfully.
2805  */
2806 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2807                     struct kvm_msr_entry *entries,
2808                     int (*do_msr)(struct kvm_vcpu *vcpu,
2809                                   unsigned index, u64 *data))
2810 {
2811         int i;
2812 
2813         for (i = 0; i < msrs->nmsrs; ++i)
2814                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2815                         break;
2816 
2817         return i;
2818 }
2819 
2820 /*
2821  * Read or write a bunch of msrs. Parameters are user addresses.
2822  *
2823  * @return number of msrs set successfully.
2824  */
2825 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2826                   int (*do_msr)(struct kvm_vcpu *vcpu,
2827                                 unsigned index, u64 *data),
2828                   int writeback)
2829 {
2830         struct kvm_msrs msrs;
2831         struct kvm_msr_entry *entries;
2832         int r, n;
2833         unsigned size;
2834 
2835         r = -EFAULT;
2836         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2837                 goto out;
2838 
2839         r = -E2BIG;
2840         if (msrs.nmsrs >= MAX_IO_MSRS)
2841                 goto out;
2842 
2843         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2844         entries = memdup_user(user_msrs->entries, size);
2845         if (IS_ERR(entries)) {
2846                 r = PTR_ERR(entries);
2847                 goto out;
2848         }
2849 
2850         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2851         if (r < 0)
2852                 goto out_free;
2853 
2854         r = -EFAULT;
2855         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2856                 goto out_free;
2857 
2858         r = n;
2859 
2860 out_free:
2861         kfree(entries);
2862 out:
2863         return r;
2864 }
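/*
 * For context, msr_io() is what backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu
 * ioctls.  The snippet below is an illustrative userspace sketch (not part
 * of the kernel build; the helper name is made up) showing how one MSR can
 * be read through a vcpu fd:
 */
#if 0	/* userspace sketch only */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int example_read_one_msr(int vcpu_fd, __u32 index, __u64 *val)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} buf;
	int nr;

	memset(&buf, 0, sizeof(buf));
	buf.hdr.nmsrs = 1;
	buf.entry.index = index;

	/* Returns the number of MSRs successfully processed (0 or 1 here). */
	nr = ioctl(vcpu_fd, KVM_GET_MSRS, &buf.hdr);
	if (nr != 1)
		return -1;

	*val = buf.entry.data;
	return 0;
}
#endif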
2865 
2866 static inline bool kvm_can_mwait_in_guest(void)
2867 {
2868         return boot_cpu_has(X86_FEATURE_MWAIT) &&
2869                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
2870                 boot_cpu_has(X86_FEATURE_ARAT);
2871 }
2872 
2873 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2874 {
2875         int r = 0;
2876 
2877         switch (ext) {
2878         case KVM_CAP_IRQCHIP:
2879         case KVM_CAP_HLT:
2880         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2881         case KVM_CAP_SET_TSS_ADDR:
2882         case KVM_CAP_EXT_CPUID:
2883         case KVM_CAP_EXT_EMUL_CPUID:
2884         case KVM_CAP_CLOCKSOURCE:
2885         case KVM_CAP_PIT:
2886         case KVM_CAP_NOP_IO_DELAY:
2887         case KVM_CAP_MP_STATE:
2888         case KVM_CAP_SYNC_MMU:
2889         case KVM_CAP_USER_NMI:
2890         case KVM_CAP_REINJECT_CONTROL:
2891         case KVM_CAP_IRQ_INJECT_STATUS:
2892         case KVM_CAP_IOEVENTFD:
2893         case KVM_CAP_IOEVENTFD_NO_LENGTH:
2894         case KVM_CAP_PIT2:
2895         case KVM_CAP_PIT_STATE2:
2896         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2897         case KVM_CAP_XEN_HVM:
2898         case KVM_CAP_VCPU_EVENTS:
2899         case KVM_CAP_HYPERV:
2900         case KVM_CAP_HYPERV_VAPIC:
2901         case KVM_CAP_HYPERV_SPIN:
2902         case KVM_CAP_HYPERV_SYNIC:
2903         case KVM_CAP_HYPERV_SYNIC2:
2904         case KVM_CAP_HYPERV_VP_INDEX:
2905         case KVM_CAP_HYPERV_EVENTFD:
2906         case KVM_CAP_HYPERV_TLBFLUSH:
2907         case KVM_CAP_PCI_SEGMENT:
2908         case KVM_CAP_DEBUGREGS:
2909         case KVM_CAP_X86_ROBUST_SINGLESTEP:
2910         case KVM_CAP_XSAVE:
2911         case KVM_CAP_ASYNC_PF:
2912         case KVM_CAP_GET_TSC_KHZ:
2913         case KVM_CAP_KVMCLOCK_CTRL:
2914         case KVM_CAP_READONLY_MEM:
2915         case KVM_CAP_HYPERV_TIME:
2916         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2917         case KVM_CAP_TSC_DEADLINE_TIMER:
2918         case KVM_CAP_ENABLE_CAP_VM:
2919         case KVM_CAP_DISABLE_QUIRKS:
2920         case KVM_CAP_SET_BOOT_CPU_ID:
2921         case KVM_CAP_SPLIT_IRQCHIP:
2922         case KVM_CAP_IMMEDIATE_EXIT:
2923         case KVM_CAP_GET_MSR_FEATURES:
2924                 r = 1;
2925                 break;
2926         case KVM_CAP_SYNC_REGS:
2927                 r = KVM_SYNC_X86_VALID_FIELDS;
2928                 break;
2929         case KVM_CAP_ADJUST_CLOCK:
2930                 r = KVM_CLOCK_TSC_STABLE;
2931                 break;
2932         case KVM_CAP_X86_DISABLE_EXITS:
2933                 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
2934                 if (kvm_can_mwait_in_guest())
2935                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
2936                 break;
2937         case KVM_CAP_X86_SMM:
2938                 /* SMBASE is usually relocated above 1M on modern chipsets,
2939                  * and SMM handlers might indeed rely on 4G segment limits,
2940                  * so do not report SMM to be available if real mode is
2941                  * emulated via vm86 mode.  Still, do not go to great lengths
2942                  * to avoid userspace's usage of the feature, because it is a
2943                  * fringe case that is not enabled except via specific settings
2944                  * of the module parameters.
2945                  */
2946                 r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
2947                 break;
2948         case KVM_CAP_VAPIC:
2949                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2950                 break;
2951         case KVM_CAP_NR_VCPUS:
2952                 r = KVM_SOFT_MAX_VCPUS;
2953                 break;
2954         case KVM_CAP_MAX_VCPUS:
2955                 r = KVM_MAX_VCPUS;
2956                 break;
2957         case KVM_CAP_NR_MEMSLOTS:
2958                 r = KVM_USER_MEM_SLOTS;
2959                 break;
2960         case KVM_CAP_PV_MMU:    /* obsolete */
2961                 r = 0;
2962                 break;
2963         case KVM_CAP_MCE:
2964                 r = KVM_MAX_MCE_BANKS;
2965                 break;
2966         case KVM_CAP_XCRS:
2967                 r = boot_cpu_has(X86_FEATURE_XSAVE);
2968                 break;
2969         case KVM_CAP_TSC_CONTROL:
2970                 r = kvm_has_tsc_control;
2971                 break;
2972         case KVM_CAP_X2APIC_API:
2973                 r = KVM_X2APIC_API_VALID_FLAGS;
2974                 break;
2975         default:
2976                 break;
2977         }
2978         return r;
2979 
2980 }
2981 
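kvm_vm_ioctl_check_extension() above backs the KVM_CHECK_EXTENSION ioctl; note that for several capabilities the return value is a bitmask or a limit rather than a plain boolean. A userspace sketch, assuming vm_fd came from KVM_CREATE_VM:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void probe_disable_exits(int vm_fd)
{
        /* For KVM_CAP_X86_DISABLE_EXITS the result is a mask of
         * KVM_X86_DISABLE_EXITS_* flags, or 0 if unsupported. */
        int mask = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);

        if (mask > 0 && (mask & KVM_X86_DISABLE_EXITS_MWAIT))
                printf("MWAIT exits can be disabled on this host\n");
}
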
2982 long kvm_arch_dev_ioctl(struct file *filp,
2983                         unsigned int ioctl, unsigned long arg)
2984 {
2985         void __user *argp = (void __user *)arg;
2986         long r;
2987 
2988         switch (ioctl) {
2989         case KVM_GET_MSR_INDEX_LIST: {
2990                 struct kvm_msr_list __user *user_msr_list = argp;
2991                 struct kvm_msr_list msr_list;
2992                 unsigned n;
2993 
2994                 r = -EFAULT;
2995                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2996                         goto out;
2997                 n = msr_list.nmsrs;
2998                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
2999                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3000                         goto out;
3001                 r = -E2BIG;
3002                 if (n < msr_list.nmsrs)
3003                         goto out;
3004                 r = -EFAULT;
3005                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3006                                  num_msrs_to_save * sizeof(u32)))
3007                         goto out;
3008                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3009                                  &emulated_msrs,
3010                                  num_emulated_msrs * sizeof(u32)))
3011                         goto out;
3012                 r = 0;
3013                 break;
3014         }
3015         case KVM_GET_SUPPORTED_CPUID:
3016         case KVM_GET_EMULATED_CPUID: {
3017                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3018                 struct kvm_cpuid2 cpuid;
3019 
3020                 r = -EFAULT;
3021                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3022                         goto out;
3023 
3024                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3025                                             ioctl);
3026                 if (r)
3027                         goto out;
3028 
3029                 r = -EFAULT;
3030                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3031                         goto out;
3032                 r = 0;
3033                 break;
3034         }
3035         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3036                 r = -EFAULT;
3037                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3038                                  sizeof(kvm_mce_cap_supported)))
3039                         goto out;
3040                 r = 0;
3041                 break;
3042         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3043                 struct kvm_msr_list __user *user_msr_list = argp;
3044                 struct kvm_msr_list msr_list;
3045                 unsigned int n;
3046 
3047                 r = -EFAULT;
3048                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3049                         goto out;
3050                 n = msr_list.nmsrs;
3051                 msr_list.nmsrs = num_msr_based_features;
3052                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3053                         goto out;
3054                 r = -E2BIG;
3055                 if (n < msr_list.nmsrs)
3056                         goto out;
3057                 r = -EFAULT;
3058                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3059                                  num_msr_based_features * sizeof(u32)))
3060                         goto out;
3061                 r = 0;
3062                 break;
3063         }
3064         case KVM_GET_MSRS:
3065                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3066                 break;
3067         }
3068         default:
3069                 r = -EINVAL;
3070         }
3071 out:
3072         return r;
3073 }
3074 
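KVM_GET_MSR_INDEX_LIST (and KVM_GET_MSR_FEATURE_INDEX_LIST) above implement a two-pass protocol: a call with too small an nmsrs fails with -E2BIG but writes back the required count. A sketch of the userspace side, assuming kvm_fd is an open /dev/kvm descriptor:

#include <errno.h>
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
        struct kvm_msr_list probe = { .nmsrs = 0 };
        struct kvm_msr_list *list;

        /* First pass: learn how many indices the kernel wants to report. */
        if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe) < 0 && errno != E2BIG)
                return NULL;

        list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
        if (!list)
                return NULL;
        list->nmsrs = probe.nmsrs;

        /* Second pass: fetch the indices themselves. */
        if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
                free(list);
                return NULL;
        }
        return list;    /* caller frees */
}
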
3075 static void wbinvd_ipi(void *garbage)
3076 {
3077         wbinvd();
3078 }
3079 
3080 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3081 {
3082         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3083 }
3084 
3085 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3086 {
3087         /* Address the possibility that WBINVD is executed by the guest */
3088         if (need_emulate_wbinvd(vcpu)) {
3089                 if (kvm_x86_ops->has_wbinvd_exit())
3090                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3091                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3092                         smp_call_function_single(vcpu->cpu,
3093                                         wbinvd_ipi, NULL, 1);
3094         }
3095 
3096         kvm_x86_ops->vcpu_load(vcpu, cpu);
3097 
3098         /* Apply any externally detected TSC adjustments (due to suspend) */
3099         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3100                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3101                 vcpu->arch.tsc_offset_adjustment = 0;
3102                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3103         }
3104 
3105         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3106                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3107                                 rdtsc() - vcpu->arch.last_host_tsc;
3108                 if (tsc_delta < 0)
3109                         mark_tsc_unstable("KVM discovered backwards TSC");
3110 
3111                 if (kvm_check_tsc_unstable()) {
3112                         u64 offset = kvm_compute_tsc_offset(vcpu,
3113                                                 vcpu->arch.last_guest_tsc);
3114                         kvm_vcpu_write_tsc_offset(vcpu, offset);
3115                         vcpu->arch.tsc_catchup = 1;
3116                 }
3117 
3118                 if (kvm_lapic_hv_timer_in_use(vcpu))
3119                         kvm_lapic_restart_hv_timer(vcpu);
3120 
3121                 /*
3122                  * On a host with synchronized TSC, there is no need to update
3123                  * kvmclock on vcpu->cpu migration
3124                  */
3125                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3126                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3127                 if (vcpu->cpu != cpu)
3128                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3129                 vcpu->cpu = cpu;
3130         }
3131 
3132         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3133 }
3134 
3135 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3136 {
3137         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3138                 return;
3139 
3140         vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3141 
3142         kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3143                         &vcpu->arch.st.steal.preempted,
3144                         offsetof(struct kvm_steal_time, preempted),
3145                         sizeof(vcpu->arch.st.steal.preempted));
3146 }
3147 
3148 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3149 {
3150         int idx;
3151 
3152         if (vcpu->preempted)
3153                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3154 
3155         /*
3156          * Disable page faults because we're in atomic context here.
3157          * kvm_write_guest_offset_cached() would call might_fault()
3158          * that relies on pagefault_disable() to tell if there's a
3159          * bug. NOTE: the write to guest memory may not go through if
3160          * it happens during postcopy live migration or while the
3161          * guest is paging heavily.
3162          */
3163         pagefault_disable();
3164         /*
3165          * kvm_memslots() will be called by
3166          * kvm_write_guest_offset_cached() so take the srcu lock.
3167          */
3168         idx = srcu_read_lock(&vcpu->kvm->srcu);
3169         kvm_steal_time_set_preempted(vcpu);
3170         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3171         pagefault_enable();
3172         kvm_x86_ops->vcpu_put(vcpu);
3173         vcpu->arch.last_host_tsc = rdtsc();
3174         /*
3175          * If userspace has set any breakpoints or watchpoints, dr6 is restored
3176          * on every vmexit, but if not, we might have a stale dr6 from the
3177          * guest. do_debug expects dr6 to be cleared after it runs, do the same.
3178          */
3179         set_debugreg(0, 6);
3180 }
3181 
3182 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3183                                     struct kvm_lapic_state *s)
3184 {
3185         if (vcpu->arch.apicv_active)
3186                 kvm_x86_ops->sync_pir_to_irr(vcpu);
3187 
3188         return kvm_apic_get_state(vcpu, s);
3189 }
3190 
3191 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3192                                     struct kvm_lapic_state *s)
3193 {
3194         int r;
3195 
3196         r = kvm_apic_set_state(vcpu, s);
3197         if (r)
3198                 return r;
3199         update_cr8_intercept(vcpu);
3200 
3201         return 0;
3202 }
3203 
3204 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3205 {
3206         return (!lapic_in_kernel(vcpu) ||
3207                 kvm_apic_accept_pic_intr(vcpu));
3208 }
3209 
3210 /*
3211  * If userspace requested an interrupt window, check that the
3212  * interrupt window is open.
3213  *
3214  * No need to exit to userspace if we already have an interrupt queued.
3215  */
3216 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3217 {
3218         return kvm_arch_interrupt_allowed(vcpu) &&
3219                 !kvm_cpu_has_interrupt(vcpu) &&
3220                 !kvm_event_needs_reinjection(vcpu) &&
3221                 kvm_cpu_accept_dm_intr(vcpu);
3222 }
3223 
3224 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3225                                     struct kvm_interrupt *irq)
3226 {
3227         if (irq->irq >= KVM_NR_INTERRUPTS)
3228                 return -EINVAL;
3229 
3230         if (!irqchip_in_kernel(vcpu->kvm)) {
3231                 kvm_queue_interrupt(vcpu, irq->irq, false);
3232                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3233                 return 0;
3234         }
3235 
3236         /*
3237          * With in-kernel LAPIC, we only use this to inject EXTINT, so
3238          * fail for in-kernel 8259.
3239          */
3240         if (pic_in_kernel(vcpu->kvm))
3241                 return -ENXIO;
3242 
3243         if (vcpu->arch.pending_external_vector != -1)
3244                 return -EEXIST;
3245 
3246         vcpu->arch.pending_external_vector = irq->irq;
3247         kvm_make_request(KVM_REQ_EVENT, vcpu);
3248         return 0;
3249 }
3250 
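kvm_vcpu_ioctl_interrupt() above is reached via KVM_INTERRUPT. With a fully userspace irqchip it queues the vector directly; with a split irqchip it only records an EXTINT; with an in-kernel PIC it fails with -ENXIO. A sketch, with vcpu_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int inject_external_irq(int vcpu_fd, unsigned int vector)
{
        struct kvm_interrupt irq = { .irq = vector };

        /* With a split irqchip, -EEXIST means an earlier vector is still pending. */
        return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}
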
3251 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3252 {
3253         kvm_inject_nmi(vcpu);
3254 
3255         return 0;
3256 }
3257 
3258 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3259 {
3260         kvm_make_request(KVM_REQ_SMI, vcpu);
3261 
3262         return 0;
3263 }
3264 
3265 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3266                                            struct kvm_tpr_access_ctl *tac)
3267 {
3268         if (tac->flags)
3269                 return -EINVAL;
3270         vcpu->arch.tpr_access_reporting = !!tac->enabled;
3271         return 0;
3272 }
3273 
3274 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3275                                         u64 mcg_cap)
3276 {
3277         int r;
3278         unsigned bank_num = mcg_cap & 0xff, bank;
3279 
3280         r = -EINVAL;
3281         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3282                 goto out;
3283         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3284                 goto out;
3285         r = 0;
3286         vcpu->arch.mcg_cap = mcg_cap;
3287         /* Init IA32_MCG_CTL to all 1s */
3288         if (mcg_cap & MCG_CTL_P)
3289                 vcpu->arch.mcg_ctl = ~(u64)0;
3290         /* Init IA32_MCi_CTL to all 1s */
3291         for (bank = 0; bank < bank_num; bank++)
3292                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3293 
3294         if (kvm_x86_ops->setup_mce)
3295                 kvm_x86_ops->setup_mce(vcpu);
3296 out:
3297         return r;
3298 }
3299 
3300 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3301                                       struct kvm_x86_mce *mce)
3302 {
3303         u64 mcg_cap = vcpu->arch.mcg_cap;
3304         unsigned bank_num = mcg_cap & 0xff;
3305         u64 *banks = vcpu->arch.mce_banks;
3306 
3307         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3308                 return -EINVAL;
3309         /*
3310          * if IA32_MCG_CTL is not all 1s, the uncorrected error
3311          * reporting is disabled
3312          */
3313         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3314             vcpu->arch.mcg_ctl != ~(u64)0)
3315                 return 0;
3316         banks += 4 * mce->bank;
3317         /*
3318          * if IA32_MCi_CTL is not all 1s, the uncorrected error
3319          * reporting is disabled for the bank
3320          */
3321         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3322                 return 0;
3323         if (mce->status & MCI_STATUS_UC) {
3324                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3325                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3326                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3327                         return 0;
3328                 }
3329                 if (banks[1] & MCI_STATUS_VAL)
3330                         mce->status |= MCI_STATUS_OVER;
3331                 banks[2] = mce->addr;
3332                 banks[3] = mce->misc;
3333                 vcpu->arch.mcg_status = mce->mcg_status;
3334                 banks[1] = mce->status;
3335                 kvm_queue_exception(vcpu, MC_VECTOR);
3336         } else if (!(banks[1] & MCI_STATUS_VAL)
3337                    || !(banks[1] & MCI_STATUS_UC)) {
3338                 if (banks[1] & MCI_STATUS_VAL)
3339                         mce->status |= MCI_STATUS_OVER;
3340                 banks[2] = mce->addr;
3341                 banks[3] = mce->misc;
3342                 banks[1] = mce->status;
3343         } else
3344                 banks[1] |= MCI_STATUS_OVER;
3345         return 0;
3346 }
3347 
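The two MCE handlers above are driven from userspace with KVM_X86_SETUP_MCE and KVM_X86_SET_MCE. The sketch below injects an uncorrected error; kvm_fd, vcpu_fd, the bank number and the fault address are all made-up illustration values, and the status bit definitions mirror the architectural IA32_MCi_STATUS/IA32_MCG_STATUS layout since they are not exported in the UAPI headers:

#include <linux/kvm.h>
#include <sys/ioctl.h>

#define MCI_STATUS_VAL  (1ULL << 63)    /* error register valid */
#define MCI_STATUS_UC   (1ULL << 61)    /* uncorrected error */
#define MCG_STATUS_MCIP (1ULL << 2)     /* machine check in progress */

static int setup_and_inject_mce(int kvm_fd, int vcpu_fd)
{
        __u64 mcg_cap;
        struct kvm_x86_mce mce = {
                .status     = MCI_STATUS_VAL | MCI_STATUS_UC,
                .mcg_status = MCG_STATUS_MCIP,
                .addr       = 0x1234000,        /* made-up guest address */
                .bank       = 1,
        };

        /* Ask KVM which MCG capabilities it emulates, then request 6 banks. */
        if (ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap) < 0)
                return -1;
        mcg_cap = (mcg_cap & ~0xffULL) | 6;
        if (ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap) < 0)
                return -1;

        /* If the guest has not set CR4.MCE this ends in a triple fault,
         * exactly as kvm_vcpu_ioctl_x86_set_mce() above implements. */
        return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}
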
3348 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3349                                                struct kvm_vcpu_events *events)
3350 {
3351         process_nmi(vcpu);
3352         /*
3353          * FIXME: pass injected and pending separately.  This is only
3354          * needed for nested virtualization, whose state cannot be
3355          * migrated yet.  For now we can combine them.
3356          */
3357         events->exception.injected =
3358                 (vcpu->arch.exception.pending ||
3359                  vcpu->arch.exception.injected) &&
3360                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3361         events->exception.nr = vcpu->arch.exception.nr;
3362         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3363         events->exception.pad = 0;
3364         events->exception.error_code = vcpu->arch.exception.error_code;
3365 
3366         events->interrupt.injected =
3367                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3368         events->interrupt.nr = vcpu->arch.interrupt.nr;
3369         events->interrupt.soft = 0;
3370         events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3371 
3372         events->nmi.injected = vcpu->arch.nmi_injected;
3373         events->nmi.pending = vcpu->arch.nmi_pending != 0;
3374         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3375         events->nmi.pad = 0;
3376 
3377         events->sipi_vector = 0; /* never valid when reporting to user space */
3378 
3379         events->smi.smm = is_smm(vcpu);
3380         events->smi.pending = vcpu->arch.smi_pending;
3381         events->smi.smm_inside_nmi =
3382                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3383         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3384 
3385         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3386                          | KVM_VCPUEVENT_VALID_SHADOW
3387                          | KVM_VCPUEVENT_VALID_SMM);
3388         memset(&events->reserved, 0, sizeof(events->reserved));
3389 }
3390 
3391 static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
3392 
3393 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3394                                               struct kvm_vcpu_events *events)
3395 {
3396         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3397                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3398                               | KVM_VCPUEVENT_VALID_SHADOW
3399                               | KVM_VCPUEVENT_VALID_SMM))
3400                 return -EINVAL;
3401 
3402         if (events->exception.injected &&
3403             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3404              is_guest_mode(vcpu)))
3405                 return -EINVAL;
3406 
3407         /* INITs are latched while in SMM */
3408         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3409             (events->smi.smm || events->smi.pending) &&
3410             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3411                 return -EINVAL;
3412 
3413         process_nmi(vcpu);
3414         vcpu->arch.exception.injected = false;
3415         vcpu->arch.exception.pending = events->exception.injected;
3416         vcpu->arch.exception.nr = events->exception.nr;
3417         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3418         vcpu->arch.exception.error_code = events->exception.error_code;
3419 
3420         vcpu->arch.interrupt.injected = events->interrupt.injected;
3421         vcpu->arch.interrupt.nr = events->interrupt.nr;
3422         vcpu->arch.interrupt.soft = events->interrupt.soft;
3423         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3424                 kvm_x86_ops->set_interrupt_shadow(vcpu,
3425                                                   events->interrupt.shadow);
3426 
3427         vcpu->arch.nmi_injected = events->nmi.injected;
3428         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3429                 vcpu->arch.nmi_pending = events->nmi.pending;
3430         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3431 
3432         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3433             lapic_in_kernel(vcpu))
3434                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3435 
3436         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3437                 u32 hflags = vcpu->arch.hflags;
3438                 if (events->smi.smm)
3439                         hflags |= HF_SMM_MASK;
3440                 else
3441                         hflags &= ~HF_SMM_MASK;
3442                 kvm_set_hflags(vcpu, hflags);
3443 
3444                 vcpu->arch.smi_pending = events->smi.pending;
3445 
3446                 if (events->smi.smm) {
3447                         if (events->smi.smm_inside_nmi)
3448                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3449                         else
3450                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3451                         if (lapic_in_kernel(vcpu)) {
3452                                 if (events->smi.latched_init)
3453                                         set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3454                                 else
3455                                         clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3456                         }
3457                 }
3458         }
3459 
3460         kvm_make_request(KVM_REQ_EVENT, vcpu);
3461 
3462         return 0;
3463 }
3464 
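The vcpu_events pair above exists mainly for save/restore: the get side always marks NMI_PENDING, SHADOW and SMM as valid in events->flags, so a straight round trip restores those fields while leaving the SIPI vector untouched. A sketch of that round trip, with both descriptors assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int copy_vcpu_events(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_vcpu_events events;

        if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;
        /* events.flags already carries the valid bits set by the get side. */
        return ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}
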
3465 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3466                                              struct kvm_debugregs *dbgregs)
3467 {
3468         unsigned long val;
3469 
3470         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3471         kvm_get_dr(vcpu, 6, &val);
3472         dbgregs->dr6 = val;
3473         dbgregs->dr7 = vcpu->arch.dr7;
3474         dbgregs->flags = 0;
3475         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3476 }
3477 
3478 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3479                                             struct kvm_debugregs *dbgregs)
3480 {
3481         if (dbgregs->flags)
3482                 return -EINVAL;
3483 
3484         if (dbgregs->dr6 & ~0xffffffffull)
3485                 return -EINVAL;
3486         if (dbgregs->dr7 & ~0xffffffffull)
3487                 return -EINVAL;
3488 
3489         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3490         kvm_update_dr0123(vcpu);
3491         vcpu->arch.dr6 = dbgregs->dr6;
3492         kvm_update_dr6(vcpu);
3493         vcpu->arch.dr7 = dbgregs->dr7;
3494         kvm_update_dr7(vcpu);
3495 
3496         return 0;
3497 }
3498 
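kvm_vcpu_ioctl_x86_set_debugregs() above restores the guest's own debug registers, for example on the destination of a migration; note the checks that dr6 and dr7 fit in 32 bits and that flags must be zero. A sketch, with vcpu_fd and the register values assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int restore_guest_debugregs(int vcpu_fd, const __u64 db[4],
                                   __u64 dr6, __u64 dr7)
{
        struct kvm_debugregs regs = {
                .db    = { db[0], db[1], db[2], db[3] },
                .dr6   = dr6,   /* rejected if it does not fit in 32 bits */
                .dr7   = dr7,
                .flags = 0,     /* no flags are defined today */
        };

        return ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &regs);
}
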
3499 #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
3500 
3501 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3502 {
3503         struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
3504         u64 xstate_bv = xsave->header.xfeatures;
3505         u64 valid;
3506 
3507         /*
3508          * Copy legacy XSAVE area, to avoid complications with CPUID
3509          * leaves 0 and 1 in the loop below.
3510          */
3511         memcpy(dest, xsave, XSAVE_HDR_OFFSET);
3512 
3513         /* Set XSTATE_BV */
3514         xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
3515         *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
3516 
3517         /*
3518          * Copy each region from the possibly compacted offset to the
3519          * non-compacted offset.
3520          */
3521         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3522         while (valid) {
3523                 u64 feature = valid & -valid;
3524                 int index = fls64(feature) - 1;
3525                 void *src = get_xsave_addr(xsave, feature);
3526 
3527                 if (src) {
3528                         u32 size, offset, ecx, edx;
3529                         cpuid_count(XSTATE_CPUID, index,
3530                                     &size, &offset, &ecx, &edx);
3531                         if (feature == XFEATURE_MASK_PKRU)
3532                                 memcpy(dest + offset, &vcpu->arch.pkru,
3533                                        sizeof(vcpu->arch.pkru));
3534                         else
3535                                 memcpy(dest + offset, src, size);
3536 
3537                 }
3538 
3539                 valid -= feature;
3540         }
3541 }
3542 
3543 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
3544 {
3545         struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
3546         u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
3547         u64 valid;
3548 
3549         /*
3550          * Copy legacy XSAVE area, to avoid complications with CPUID
3551          * leaves 0 and 1 in the loop below.
3552          */
3553         memcpy(xsave, src, XSAVE_HDR_OFFSET);
3554 
3555         /* Set XSTATE_BV and possibly XCOMP_BV.  */
3556         xsave->header.xfeatures = xstate_bv;
3557         if (boot_cpu_has(X86_FEATURE_XSAVES))
3558                 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
3559 
3560         /*
3561          * Copy each region from the non-compacted offset to the
3562          * possibly compacted offset.
3563          */
3564         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3565         while (valid) {
3566                 u64 feature = valid & -valid;
3567                 int index = fls64(feature) - 1;
3568                 void *dest = get_xsave_addr(xsave, feature);
3569 
3570                 if (dest) {
3571                         u32 size, offset, ecx, edx;
3572                         cpuid_count(XSTATE_CPUID, index,
3573                                     &size, &offset, &ecx, &edx);
3574                         if (feature == XFEATURE_MASK_PKRU)
3575                                 memcpy(&vcpu->arch.pkru, src + offset,
3576                                        sizeof(vcpu->arch.pkru));
3577                         else
3578                                 memcpy(dest, src + offset, size);
3579                 }
3580 
3581                 valid -= feature;
3582         }
3583 }
3584 
3585 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3586                                          struct kvm_xsave *guest_xsave)
3587 {
3588         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
3589                 memset(guest_xsave, 0, sizeof(struct kvm_xsave));
3590                 fill_xsave((u8 *) guest_xsave->region, vcpu);
3591         } else {
3592                 memcpy(guest_xsave->region,
3593                         &vcpu->arch.guest_fpu.state.fxsave,
3594                         sizeof(struct fxregs_state));
3595                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3596                         XFEATURE_MASK_FPSSE;
3597         }
3598 }
3599 
3600 #define XSAVE_MXCSR_OFFSET 24
3601 
3602 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3603                                         struct kvm_xsave *guest_xsave)
3604 {
3605         u64 xstate_bv =
3606                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3607         u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
3608 
3609         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
3610                 /*
3611                  * Here we allow setting states that are not present in
3612                  * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
3613                  * with old userspace.
3614                  */
3615                 if (xstate_bv & ~kvm_supported_xcr0() ||
3616                         mxcsr & ~mxcsr_feature_mask)
3617                         return -EINVAL;
3618                 load_xsave(vcpu, (u8 *)guest_xsave->region);
3619         } else {
3620                 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
3621                         mxcsr & ~mxcsr_feature_mask)
3622                         return -EINVAL;
3623                 memcpy(&vcpu->arch.guest_fpu.state.fxsave,
3624                         guest_xsave->region, sizeof(struct fxregs_state));
3625         }
3626         return 0;
3627 }
3628 
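The get/set_xsave handlers above convert between the guest FPU state and the 4 KiB, non-compacted layout of struct kvm_xsave; fill_xsave() and load_xsave() do the compaction translation. A save/restore sketch, with both vCPU descriptors assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int copy_xsave_state(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_xsave xsave;         /* 4 KiB, standard XSAVE layout */

        if (ioctl(src_vcpu_fd, KVM_GET_XSAVE, &xsave) < 0)
                return -1;
        return ioctl(dst_vcpu_fd, KVM_SET_XSAVE, &xsave);
}
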
3629 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3630                                         struct kvm_xcrs *guest_xcrs)
3631 {
3632         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
3633                 guest_xcrs->nr_xcrs = 0;
3634                 return;
3635         }
3636 
3637         guest_xcrs->nr_xcrs = 1;
3638         guest_xcrs->flags = 0;
3639         guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3640         guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3641 }
3642 
3643 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3644                                        struct kvm_xcrs *guest_xcrs)
3645 {
3646         int i, r = 0;
3647 
3648         if (!boot_cpu_has(X86_FEATURE_XSAVE))
3649                 return -EINVAL;
3650 
3651         if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3652                 return -EINVAL;
3653 
3654         for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3655                 /* Only support XCR0 currently */
3656                 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3657                         r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3658                                 guest_xcrs->xcrs[i].value);
3659                         break;
3660                 }
3661         if (r)
3662                 r = -EINVAL;
3663         return r;
3664 }
3665 
3666 /*
3667  * kvm_set_guest_paused() indicates to the guest kernel that it has been
3668  * stopped by the hypervisor.  This function will be called from the host only.
3669  * EINVAL is returned when the host attempts to set the flag for a guest that
3670  * does not support pv clocks.
3671  */
3672 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
3673 {
3674         if (!vcpu->arch.pv_time_enabled)
3675                 return -EINVAL;
3676         vcpu->arch.pvclock_set_guest_stopped_request = true;
3677         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3678         return 0;
3679 }
3680 
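kvm_set_guest_paused() above backs KVM_KVMCLOCK_CTRL: after a vCPU has been stopped for a long time (snapshots, debugger pauses), telling the guest it was paused keeps its soft-lockup watchdog quiet. A sketch, with vcpu_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int mark_vcpu_paused(int vcpu_fd)
{
        /* Fails with -EINVAL if the guest has not registered a pvclock area. */
        return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
}
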
3681 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
3682                                      struct kvm_enable_cap *cap)
3683 {
3684         if (cap->flags)
3685                 return -EINVAL;
3686 
3687         switch (cap->cap) {
3688         case KVM_CAP_HYPERV_SYNIC2:
3689                 if (cap->args[0])
3690                         return -EINVAL;
3691         case KVM_CAP_HYPERV_SYNIC:
3692                 if (!irqchip_in_kernel(vcpu->kvm))
3693                         return -EINVAL;
3694                 return kvm_hv_activate_synic(vcpu, cap->cap ==
3695                                              KVM_CAP_HYPERV_SYNIC2);
3696         default:
3697                 return -EINVAL;
3698         }
3699 }
3700 
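The per-vCPU KVM_ENABLE_CAP path above currently only accepts the Hyper-V SynIC capabilities and requires an in-kernel irqchip. A sketch of enabling it, with vcpu_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_hyperv_synic(int vcpu_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_HYPERV_SYNIC,
                /* flags and args must be zero for this capability */
        };

        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}
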
3701 long kvm_arch_vcpu_ioctl(struct file *filp,
3702                          unsigned int ioctl, unsigned long arg)
3703 {
3704         struct kvm_vcpu *vcpu = filp->private_data;
3705         void __user *argp = (void __user *)arg;
3706         int r;
3707         union {
3708                 struct kvm_lapic_state *lapic;
3709                 struct kvm_xsave *xsave;
3710                 struct kvm_xcrs *xcrs;
3711                 void *buffer;
3712         } u;
3713 
3714         vcpu_load(vcpu);
3715 
3716         u.buffer = NULL;
3717         switch (ioctl) {
3718         case KVM_GET_LAPIC: {
3719                 r = -EINVAL;
3720                 if (!lapic_in_kernel(vcpu))
3721                         goto out;
3722                 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3723 
3724                 r = -ENOMEM;
3725                 if (!u.lapic)
3726                         goto out;
3727                 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3728                 if (r)
3729                         goto out;
3730                 r = -EFAULT;
3731                 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3732                         goto out;
3733                 r = 0;
3734                 break;
3735         }
3736         case KVM_SET_LAPIC: {
3737                 r = -EINVAL;
3738                 if (!lapic_in_kernel(vcpu))
3739                         goto out;
3740                 u.lapic = memdup_user(argp, sizeof(*u.lapic));
3741                 if (IS_ERR(u.lapic)) {
3742                         r = PTR_ERR(u.lapic);
3743                         goto out_nofree;
3744                 }
3745 
3746                 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3747                 break;
3748         }
3749         case KVM_INTERRUPT: {
3750                 struct kvm_interrupt irq;
3751 
3752                 r = -EFAULT;
3753                 if (copy_from_user(&irq, argp, sizeof irq))
3754                         goto out;
3755                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3756                 break;
3757         }
3758         case KVM_NMI: {
3759                 r = kvm_vcpu_ioctl_nmi(vcpu);
3760                 break;
3761         }
3762         case KVM_SMI: {
3763                 r = kvm_vcpu_ioctl_smi(vcpu);
3764                 break;
3765         }
3766         case KVM_SET_CPUID: {
3767                 struct kvm_cpuid __user *cpuid_arg = argp;
3768                 struct kvm_cpuid cpuid;
3769 
3770                 r = -EFAULT;
3771                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3772                         goto out;
3773                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3774                 break;
3775         }
3776         case KVM_SET_CPUID2: {
3777                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3778                 struct kvm_cpuid2 cpuid;
3779 
3780                 r = -EFAULT;
3781                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3782                         goto out;
3783                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3784                                               cpuid_arg->entries);
3785                 break;
3786         }
3787         case KVM_GET_CPUID2: {
3788                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3789                 struct kvm_cpuid2 cpuid;
3790 
3791                 r = -EFAULT;
3792                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3793                         goto out;
3794                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3795                                               cpuid_arg->entries);
3796                 if (r)
3797                         goto out;
3798                 r = -EFAULT;
3799                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3800                         goto out;
3801                 r = 0;
3802                 break;
3803         }
3804         case KVM_GET_MSRS: {
3805                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3806                 r = msr_io(vcpu, argp, do_get_msr, 1);
3807                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3808                 break;
3809         }
3810         case KVM_SET_MSRS: {
3811                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3812                 r = msr_io(vcpu, argp, do_set_msr, 0);
3813                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3814                 break;
3815         }
3816         case KVM_TPR_ACCESS_REPORTING: {
3817                 struct kvm_tpr_access_ctl tac;
3818 
3819                 r = -EFAULT;
3820                 if (copy_from_user(&tac, argp, sizeof tac))
3821                         goto out;
3822                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3823                 if (r)
3824                         goto out;
3825                 r = -EFAULT;
3826                 if (copy_to_user(argp, &tac, sizeof tac))
3827                         goto out;
3828                 r = 0;
3829                 break;
3830         }
3831         case KVM_SET_VAPIC_ADDR: {
3832                 struct kvm_vapic_addr va;
3833                 int idx;
3834 
3835                 r = -EINVAL;
3836                 if (!lapic_in_kernel(vcpu))
3837                         goto out;
3838                 r = -EFAULT;
3839                 if (copy_from_user(&va, argp, sizeof va))
3840                         goto out;
3841                 idx = srcu_read_lock(&vcpu->kvm->srcu);
3842                 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3843                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3844                 break;
3845         }
3846         case KVM_X86_SETUP_MCE: {
3847                 u64 mcg_cap;
3848 
3849                 r = -EFAULT;
3850                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3851                         goto out;
3852                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3853                 break;
3854         }
3855         case KVM_X86_SET_MCE: {
3856                 struct kvm_x86_mce mce;
3857 
3858                 r = -EFAULT;
3859                 if (copy_from_user(&mce, argp, sizeof mce))
3860                         goto out;
3861                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3862                 break;
3863         }
3864         case KVM_GET_VCPU_EVENTS: {
3865                 struct kvm_vcpu_events events;
3866 
3867                 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3868 
3869                 r = -EFAULT;
3870                 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3871                         break;
3872                 r = 0;
3873                 break;
3874         }
3875         case KVM_SET_VCPU_EVENTS: {
3876                 struct kvm_vcpu_events events;
3877 
3878                 r = -EFAULT;
3879                 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3880                         break;
3881 
3882                 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3883                 break;
3884         }
3885         case KVM_GET_DEBUGREGS: {
3886                 struct kvm_debugregs dbgregs;
3887 
3888                 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3889 
3890                 r = -EFAULT;
3891                 if (copy_to_user(argp, &dbgregs,
3892                                  sizeof(struct kvm_debugregs)))
3893                         break;
3894                 r = 0;
3895                 break;
3896         }
3897         case KVM_SET_DEBUGREGS: {
3898                 struct kvm_debugregs dbgregs;
3899 
3900                 r = -EFAULT;
3901                 if (copy_from_user(&dbgregs, argp,
3902                                    sizeof(struct kvm_debugregs)))
3903                         break;
3904 
3905                 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3906                 break;
3907         }
3908         case KVM_GET_XSAVE: {
3909                 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3910                 r = -ENOMEM;
3911                 if (!u.xsave)
3912                         break;
3913 
3914                 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3915 
3916                 r = -EFAULT;
3917                 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3918                         break;
3919                 r = 0;
3920                 break;
3921         }
3922         case KVM_SET_XSAVE: {
3923                 u.xsave = memdup_user(argp, sizeof(*u.xsave));
3924                 if (IS_ERR(u.xsave)) {
3925                         r = PTR_ERR(u.xsave);
3926                         goto out_nofree;
3927                 }
3928 
3929                 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3930                 break;
3931         }
3932         case KVM_GET_XCRS: {
3933                 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3934                 r = -ENOMEM;
3935                 if (!u.xcrs)
3936                         break;
3937 
3938                 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3939 
3940                 r = -EFAULT;
3941                 if (copy_to_user(argp, u.xcrs,
3942                                  sizeof(struct kvm_xcrs)))
3943                         break;
3944                 r = 0;
3945                 break;
3946         }
3947         case KVM_SET_XCRS: {
3948                 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3949                 if (IS_ERR(u.xcrs)) {
3950                         r = PTR_ERR(u.xcrs);
3951                         goto out_nofree;
3952                 }
3953 
3954                 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3955                 break;
3956         }
3957         case KVM_SET_TSC_KHZ: {
3958                 u32 user_tsc_khz;
3959 
3960                 r = -EINVAL;
3961                 user_tsc_khz = (u32)arg;
3962 
3963                 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3964                         goto out;
3965 
3966                 if (user_tsc_khz == 0)
3967                         user_tsc_khz = tsc_khz;
3968 
3969                 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
3970                         r = 0;
3971 
3972                 goto out;
3973         }
3974         case KVM_GET_TSC_KHZ: {
3975                 r = vcpu->arch.virtual_tsc_khz;
3976                 goto out;
3977         }
3978         case KVM_KVMCLOCK_CTRL: {
3979                 r = kvm_set_guest_paused(vcpu);
3980                 goto out;
3981         }
3982         case KVM_ENABLE_CAP: {
3983                 struct kvm_enable_cap cap;
3984 
3985                 r = -EFAULT;
3986                 if (copy_from_user(&cap, argp, sizeof(cap)))
3987                         goto out;
3988                 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
3989                 break;
3990         }
3991         default:
3992                 r = -EINVAL;
3993         }
3994 out:
3995         kfree(u.buffer);
3996 out_nofree:
3997         vcpu_put(vcpu);
3998         return r;
3999 }
4000 
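One detail of the dispatcher above that is easy to miss: KVM_SET_TSC_KHZ and KVM_GET_TSC_KHZ pass the frequency in the ioctl argument itself rather than through a pointer. A sketch, with vcpu_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_guest_tsc_khz(int vcpu_fd, unsigned int tsc_khz)
{
        /* 0 means "use the host TSC frequency"; out-of-range values are
         * rejected with -EINVAL, as the handler above shows. */
        return ioctl(vcpu_fd, KVM_SET_TSC_KHZ, tsc_khz);
}
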
4001 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
4002 {
4003         return VM_FAULT_SIGBUS;
4004 }
4005 
4006 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
4007 {
4008         int ret;
4009 
4010         if (addr > (unsigned int)(-3 * PAGE_SIZE))
4011                 return -EINVAL;
4012         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
4013         return ret;
4014 }
4015 
4016 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
4017                                               u64 ident_addr)
4018 {
4019         return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
4020 }
4021 
4022 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
4023                                           u32 kvm_nr_mmu_pages)
4024 {
4025         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
4026                 return -EINVAL;
4027 
4028         mutex_lock(&kvm->slots_lock);
4029 
4030         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
4031         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
4032 
4033         mutex_unlock(&kvm->slots_lock);
4034         return 0;
4035 }
4036 
4037 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
4038 {
4039         return kvm->arch.n_max_mmu_pages;
4040 }
4041 
4042 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4043 {
4044         struct kvm_pic *pic = kvm->arch.vpic;
4045         int r;
4046 
4047         r = 0;
4048         switch (chip->chip_id) {
4049         case KVM_IRQCHIP_PIC_MASTER:
4050                 memcpy(&chip->chip.pic, &pic->pics[0],
4051                         sizeof(struct kvm_pic_state));
4052                 break;
4053         case KVM_IRQCHIP_PIC_SLAVE:
4054                 memcpy(&chip->chip.pic, &pic->pics[1],
4055                         sizeof(struct kvm_pic_state));
4056                 break;
4057         case KVM_IRQCHIP_IOAPIC:
4058                 kvm_get_ioapic(kvm, &chip->chip.ioapic);
4059                 break;
4060         default:
4061                 r = -EINVAL;
4062                 break;
4063         }
4064         return r;
4065 }
4066 
4067 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4068 {
4069         struct kvm_pic *pic = kvm->arch.vpic;
4070         int r;
4071 
4072         r = 0;
4073         switch (chip->chip_id) {
4074         case KVM_IRQCHIP_PIC_MASTER:
4075                 spin_lock(&pic->lock);
4076                 memcpy(&pic->pics[0], &chip->chip.pic,
4077                         sizeof(struct kvm_pic_state));
4078                 spin_unlock(&pic->lock);
4079                 break;
4080         case KVM_IRQCHIP_PIC_SLAVE:
4081                 spin_lock(&pic->lock);
4082                 memcpy(&pic->pics[1], &chip->chip.pic,
4083                         sizeof(struct kvm_pic_state));
4084                 spin_unlock(&pic->lock);
4085                 break;
4086         case KVM_IRQCHIP_IOAPIC:
4087                 kvm_set_ioapic(kvm, &chip->chip.ioapic);
4088                 break;
4089         default:
4090                 r = -EINVAL;
4091                 break;
4092         }
4093         kvm_pic_update_irq(pic);
4094         return r;
4095 }
4096 
4097 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4098 {
4099         struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
4100 
4101         BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
4102 
4103         mutex_lock(&kps->lock);
4104         memcpy(ps, &kps->channels, sizeof(*ps));
4105         mutex_unlock(&kps->lock);
4106         return 0;
4107 }
4108 
4109 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4110 {
4111         int i;
4112         struct kvm_pit *pit = kvm->arch.vpit;
4113 
4114         mutex_lock(&pit->pit_state.lock);
4115         memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
4116         for (i = 0; i < 3; i++)
4117                 kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
4118         mutex_unlock(&pit->pit_state.lock);
4119         return 0;
4120 }
4121 
4122 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4123 {
4124         mutex_lock(&kvm->arch.vpit->pit_state.lock);
4125         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
4126                 sizeof(ps->channels));
4127         ps->flags = kvm->arch.vpit->pit_state.flags;
4128         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
4129         memset(&ps->reserved, 0, sizeof(ps->reserved));
4130         return 0;
4131 }
4132 
4133 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4134 {
4135         int start = 0;
4136         int i;
4137         u32 prev_legacy, cur_legacy;
4138         struct kvm_pit *pit = kvm->arch.vpit;
4139 
4140         mutex_lock(&pit->pit_state.lock);
4141         prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
4142         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
4143         if (!prev_legacy && cur_legacy)
4144                 start = 1;
4145         memcpy(&pit->pit_state.channels, &ps->channels,
4146                sizeof(pit->pit_state.channels));
4147         pit->pit_state.flags = ps->flags;
4148         for (i = 0; i < 3; i++)
4149                 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
4150                                    start && i == 0);
4151         mutex_unlock(&pit->pit_state.lock);
4152         return 0;
4153 }
4154 
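The PIT state accessors above serve KVM_GET_PIT2/KVM_SET_PIT2, which require a PIT created earlier with KVM_CREATE_PIT2; the set side also reloads the channel counters. A save/restore sketch, with both VM descriptors assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int copy_pit_state(int src_vm_fd, int dst_vm_fd)
{
        struct kvm_pit_state2 ps;

        if (ioctl(src_vm_fd, KVM_GET_PIT2, &ps) < 0)
                return -1;
        return ioctl(dst_vm_fd, KVM_SET_PIT2, &ps);
}
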
4155 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
4156                                  struct kvm_reinject_control *control)
4157 {
4158         struct kvm_pit *pit = kvm->arch.vpit;
4159 
4160         if (!pit)
4161                 return -ENXIO;
4162 
4163         /* pit->pit_state.lock was overloaded to prevent userspace from getting
4164          * an inconsistent state after running multiple KVM_REINJECT_CONTROL
4165          * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
4166          */
4167         mutex_lock(&pit->pit_state.lock);
4168         kvm_pit_set_reinject(pit, control->pit_reinject);
4169         mutex_unlock(&pit->pit_state.lock);
4170 
4171         return 0;
4172 }
4173 
4174 /**
4175  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
4176  * @kvm: kvm instance
4177  * @log: slot id and address to which we copy the log
4178  *
4179  * Steps 1-4 below provide a general overview of dirty page logging. See
4180  * kvm_get_dirty_log_protect() function description for additional details.
4181  *
4182  * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
4183  * always flush the TLB (step 4), even if a previous step failed and the
4184  * dirty bitmap may be corrupt. Regardless of the outcome, the KVM logging
4185  * API does not preclude a subsequent dirty log read by user space. Flushing
4186  * the TLB ensures that new writes are marked dirty before the next log read.
4187  *
4188  *   1. Take a snapshot of the bit and clear it if needed.
4189  *   2. Write protect the corresponding page.
4190  *   3. Copy the snapshot to userspace.
4191  *   4. Flush TLBs if needed.
4192  */
4193 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
4194 {
4195         bool is_dirty = false;
4196         int r;
4197 
4198         mutex_lock(&kvm->slots_lock);
4199 
4200         /*
4201          * Flush potentially hardware-cached dirty pages to dirty_bitmap.
4202          */
4203         if (kvm_x86_ops->flush_log_dirty)
4204                 kvm_x86_ops->flush_log_dirty(kvm);
4205 
4206         r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
4207 
4208         /*
4209          * All the TLBs can be flushed out of mmu lock, see the comments in
4210          * kvm_mmu_slot_remove_write_access().
4211          */
4212         lockdep_assert_held(&kvm->slots_lock);
4213         if (is_dirty)
4214                 kvm_flush_remote_tlbs(kvm);
4215 
4216         mutex_unlock(&kvm->slots_lock);
4217         return r;
4218 }
4219 
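kvm_vm_ioctl_get_dirty_log() above fills a user-supplied bitmap with one bit per page of the chosen memslot. A sketch of the caller's side, assuming vm_fd, slot and slot_npages describe a memslot registered earlier with KVM_SET_USER_MEMORY_REGION:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int fetch_dirty_log(int vm_fd, __u32 slot, size_t slot_npages)
{
        /* One bit per page, rounded up to a multiple of 64 bits. */
        size_t bytes = ((slot_npages + 63) / 64) * 8;
        void *bitmap = calloc(1, bytes);
        struct kvm_dirty_log log = { .slot = slot, .dirty_bitmap = bitmap };
        int ret;

        if (!bitmap)
                return -1;
        ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        /* ... scan the bitmap and resend dirty pages here ... */
        free(bitmap);
        return ret;
}
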
4220 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
4221                         bool line_status)
4222 {
4223         if (!irqchip_in_kernel(kvm))
4224                 return -ENXIO;
4225 
4226         irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
4227                                         irq_event->irq, irq_event->level,
4228                                         line_status);
4229         return 0;
4230 }
4231 
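kvm_vm_ioctl_irq_line() above is how a device model drives a GSI of the in-kernel irqchip; level-triggered lines are raised and lowered explicitly, edge-triggered ones are pulsed 1 then 0. A sketch, with vm_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_gsi_level(int vm_fd, __u32 gsi, int level)
{
        struct kvm_irq_level irq = { .irq = gsi, .level = level ? 1 : 0 };

        /* Fails with -ENXIO unless an in-kernel irqchip was created. */
        return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}
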
4232 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4233                                    struct kvm_enable_cap *cap)
4234 {
4235         int r;
4236 
4237         if (cap->flags)
4238                 return -EINVAL;
4239 
4240         switch (cap->cap) {
4241         case KVM_CAP_DISABLE_QUIRKS:
4242                 kvm->arch.disabled_quirks = cap->args[0];
4243                 r = 0;
4244                 break;
4245         case KVM_CAP_SPLIT_IRQCHIP: {
4246                 mutex_lock(&kvm->lock);
4247                 r = -EINVAL;
4248                 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
4249                         goto split_irqchip_unlock;
4250                 r = -EEXIST;
4251                 if (irqchip_in_kernel(kvm))
4252                         goto split_irqchip_unlock;
4253                 if (kvm->created_vcpus)
4254                         goto split_irqchip_unlock;
4255                 r = kvm_setup_empty_irq_routing(kvm);
4256                 if (r)
4257                         goto split_irqchip_unlock;
4258                 /* Pairs with irqchip_in_kernel. */
4259                 smp_wmb();
4260                 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
4261                 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
4262                 r = 0;
4263 split_irqchip_unlock:
4264                 mutex_unlock(&kvm->lock);
4265                 break;
4266         }
4267         case KVM_CAP_X2APIC_API:
4268                 r = -EINVAL;
4269                 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
4270                         break;
4271 
4272                 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
4273                         kvm->arch.x2apic_format = true;
4274                 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
4275                         kvm->arch.x2apic_broadcast_quirk_disabled = true;
4276 
4277                 r = 0;
4278                 break;
4279         case KVM_CAP_X86_DISABLE_EXITS:
4280                 r = -EINVAL;
4281                 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
4282                         break;
4283 
4284                 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
4285                         kvm_can_mwait_in_guest())
4286                         kvm->arch.mwait_in_guest = true;
4287                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
4288                         kvm->arch.hlt_in_guest = true;
4289                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
4290                         kvm->arch.pause_in_guest = true;
4291                 r = 0;
4292                 break;
4293         default:
4294                 r = -EINVAL;
4295                 break;
4296         }
4297         return r;
4298 }
4299 
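The VM-wide KVM_ENABLE_CAP handler above is where HLT/PAUSE/MWAIT exits are disabled; the settings are consumed when vCPUs are created, so userspace typically enables this before creating them. A sketch, with vm_fd assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int disable_hlt_pause_exits(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap     = KVM_CAP_X86_DISABLE_EXITS,
                .args[0] = KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE,
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
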
4300 long kvm_arch_vm_ioctl(struct file *filp,
4301                        unsigned int ioctl, unsigned long arg)
4302 {
4303         struct kvm *kvm = filp->private_data;
4304         void __user *argp = (void __user *)arg;
4305         int r = -ENOTTY;
4306         /*
4307          * This union makes it completely explicit to gcc-3.x
4308          * that these two variables' stack usage should be
4309          * combined, not added together.
4310          */
4311         union {
4312                 struct kvm_pit_state ps;
4313                 struct kvm_pit_state2 ps2;
4314                 struct kvm_pit_config pit_config;
4315         } u;
4316 
4317         switch (ioctl) {
4318         case KVM_SET_TSS_ADDR:
4319                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
4320                 break;
4321         case KVM_SET_IDENTITY_MAP_ADDR: {
4322                 u64 ident_addr;
4323 
4324                 mutex_lock(&kvm->lock);
4325                 r = -EINVAL;
4326                 if (kvm->created_vcpus)
4327                         goto set_identity_unlock;
4328                 r = -EFAULT;
4329                 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
4330                         goto set_identity_unlock;
4331                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
4332 set_identity_unlock:
4333                 mutex_unlock(&kvm->lock);
4334                 break;
4335         }
4336         case KVM_SET_NR_MMU_PAGES:
4337                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
4338                 break;
4339         case KVM_GET_NR_MMU_PAGES:
4340                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
4341                 break;
4342         case KVM_CREATE_IRQCHIP: {
4343                 mutex_lock(&kvm->lock);
4344 
4345                 r = -EEXIST;
4346                 if (irqchip_in_kernel(kvm))
4347                         goto create_irqchip_unlock;
4348 
4349                 r = -EINVAL;
4350                 if (kvm->created_vcpus)
4351                         goto create_irqchip_unlock;
4352 
4353                 r = kvm_pic_init(kvm);
4354                 if (r)
4355                         goto create_irqchip_unlock;
4356 
4357                 r = kvm_ioapic_init(kvm);
4358                 if (r) {
4359                         kvm_pic_destroy(kvm);
4360                         goto create_irqchip_unlock;
4361                 }
4362 
4363                 r = kvm_setup_default_irq_routing(kvm);
4364                 if (r) {
4365                         kvm_ioapic_destroy(kvm);
4366                         kvm_pic_destroy(kvm);
4367                         goto create_irqchip_unlock;
4368                 }
4369                 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
4370                 smp_wmb();
4371                 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
4372         create_irqchip_unlock:
4373                 mutex_unlock(&kvm->lock);
4374                 break;
4375         }
4376         case KVM_CREATE_PIT:
4377                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
4378                 goto create_pit;
4379         case KVM_CREATE_PIT2:
4380                 r = -EFAULT;
4381                 if (copy_from_user(&u.pit_config, argp,
4382                                    sizeof(struct kvm_pit_config)))
4383                         goto out;
4384         create_pit:
4385                 mutex_lock(&kvm->lock);
4386                 r = -EEXIST;
4387                 if (kvm->arch.vpit)
4388                         goto create_pit_unlock;
4389                 r = -ENOMEM;
4390                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
4391                 if (kvm->arch.vpit)
4392                         r = 0;
4393         create_pit_unlock:
4394                 mutex_unlock(&kvm->lock);
4395                 break;
4396         case KVM_GET_IRQCHIP: {
4397                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
4398                 struct kvm_irqchip *chip;
4399 
4400                 chip = memdup_user(argp, sizeof(*chip));
4401                 if (IS_ERR(chip)) {
4402                         r = PTR_ERR(chip);
4403                         goto out;
4404                 }
4405 
4406                 r = -ENXIO;
4407                 if (!irqchip_kernel(kvm))
4408                         goto get_irqchip_out;
4409                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
4410                 if (r)
4411                         goto get_irqchip_out;
4412                 r = -EFAULT;
4413                 if (copy_to_user(argp, chip, sizeof(*chip)))
4414                         goto get_irqchip_out;
4415                 r = 0;
4416         get_irqchip_out:
4417                 kfree(chip);
4418                 break;
4419         }
4420         case KVM_SET_IRQCHIP: {
4421                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
4422                 struct kvm_irqchip *chip;
4423 
4424                 chip = memdup_user(argp, sizeof(*chip));
4425                 if (IS_ERR(chip)) {
4426                         r = PTR_ERR(chip);
4427                         goto out;
4428                 }
4429 
4430                 r = -ENXIO;
4431                 if (!irqchip_kernel(kvm))
4432                         goto set_irqchip_out;
4433                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4434                 if (r)
4435                         goto set_irqchip_out;
4436                 r = 0;
4437         set_irqchip_out:
4438                 kfree(chip);
4439                 break;
4440         }
4441         case KVM_GET_PIT: {
4442                 r = -EFAULT;
4443                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
4444                         goto out;
4445                 r = -ENXIO;
4446                 if (!kvm->arch.vpit)
4447                         goto out;
4448                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
4449                 if (r)
4450                         goto out;
4451                 r = -EFAULT;
4452                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
4453                         goto out;
4454                 r = 0;
4455                 break;
4456         }
4457         case KVM_SET_PIT: {
4458                 r = -EFAULT;
4459                 if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
4460                         goto out;
4461                 r = -ENXIO;
4462                 if (!kvm->arch.vpit)
4463                         goto out;
4464                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
4465                 break;
4466         }
4467         case KVM_GET_PIT2: {
4468                 r = -ENXIO;
4469                 if (!kvm->arch.vpit)
4470                         goto out;
4471                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
4472                 if (r)
4473                         goto out;
4474                 r = -EFAULT;
4475                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
4476                         goto out;
4477                 r = 0;
4478                 break;
4479         }
4480         case KVM_SET_PIT2: {
4481                 r = -EFAULT;
4482                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
4483                         goto out;
4484                 r = -ENXIO;
4485                 if (!kvm->arch.vpit)
4486                         goto out;
4487                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
4488                 break;
4489         }
4490         case KVM_REINJECT_CONTROL: {
4491                 struct kvm_reinject_control control;
4492                 r = -EFAULT;
4493                 if (copy_from_user(&control, argp, sizeof(control)))
4494                         goto out;
4495                 r = kvm_vm_ioctl_reinject(kvm, &control);
4496                 break;
4497         }
4498         case KVM_SET_BOOT_CPU_ID:
4499                 r = 0;
4500                 mutex_lock(&kvm->lock);
4501                 if (kvm->created_vcpus)
4502                         r = -EBUSY;
4503                 else
4504                         kvm->arch.bsp_vcpu_id = arg;
4505                 mutex_unlock(&kvm->lock);
4506                 break;
4507         case KVM_XEN_HVM_CONFIG: {
4508                 struct kvm_xen_hvm_config xhc;
4509                 r = -EFAULT;
4510                 if (copy_from_user(&xhc, argp, sizeof(xhc)))
4511                         goto out;
4512                 r = -EINVAL;
4513                 if (xhc.flags)
4514                         goto out;
4515                 memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
4516                 r = 0;
4517                 break;
4518         }
4519         case KVM_SET_CLOCK: {
4520                 struct kvm_clock_data user_ns;
4521                 u64 now_ns;
4522 
4523                 r = -EFAULT;
4524                 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
4525                         goto out;
4526 
4527                 r = -EINVAL;
4528                 if (user_ns.flags)
4529                         goto out;
4530 
4531                 r = 0;
4532                 /*
4533                  * TODO: userspace has to take care of races with VCPU_RUN, so
4534                  * kvm_gen_update_masterclock() can be cut down to locked
4535                  * pvclock_update_vm_gtod_copy().
4536                  */
4537                 kvm_gen_update_masterclock(kvm);
4538                 now_ns = get_kvmclock_ns(kvm);
4539                 kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
4540                 kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
4541                 break;
4542         }
4543         case KVM_GET_CLOCK: {
4544                 struct kvm_clock_data user_ns;
4545                 u64 now_ns;
4546 
4547                 now_ns = get_kvmclock_ns(kvm);
4548                 user_ns.clock = now_ns;
4549                 user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
4550                 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
4551 
4552                 r = -EFAULT;
4553                 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
4554                         goto out;
4555                 r = 0;
4556                 break;
4557         }
4558         case KVM_ENABLE_CAP: {
4559                 struct kvm_enable_cap cap;
4560 
4561                 r = -EFAULT;
4562                 if (copy_from_user(&cap, argp, sizeof(cap)))
4563                         goto out;
4564                 r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4565                 break;
4566         }
4567         case KVM_MEMORY_ENCRYPT_OP: {
4568                 r = -ENOTTY;
4569                 if (kvm_x86_ops->mem_enc_op)
4570                         r = kvm_x86_ops->mem_enc_op(kvm, argp);
4571                 break;
4572         }
4573         case KVM_MEMORY_ENCRYPT_REG_REGION: {
4574                 struct kvm_enc_region region;
4575 
4576                 r = -EFAULT;
4577                 if (copy_from_user(&region, argp, sizeof(region)))
4578                         goto out;
4579 
4580                 r = -ENOTTY;
4581                 if (kvm_x86_ops->mem_enc_reg_region)
4582                         r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
4583                 break;
4584         }
4585         case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
4586                 struct kvm_enc_region region;
4587 
4588                 r = -EFAULT;
4589                 if (copy_from_user(&region, argp, sizeof(region)))
4590                         goto out;
4591 
4592                 r = -ENOTTY;
4593                 if (kvm_x86_ops->mem_enc_unreg_region)
4594                         r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
4595                 break;
4596         }
4597         case KVM_HYPERV_EVENTFD: {
4598                 struct kvm_hyperv_eventfd hvevfd;
4599 
4600                 r = -EFAULT;
4601                 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
4602                         goto out;
4603                 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
4604                 break;
4605         }
4606         default:
4607                 r = -ENOTTY;
4608         }
4609 out:
4610         return r;
4611 }
4612 
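     /*
      * Trim the MSR tables to what this host actually supports: drop
      * save/restore MSRs that fault under rdmsr_safe() or lack a required
      * feature, emulated MSRs the vendor module does not handle, and
      * feature MSRs that kvm_get_msr_feature() rejects.  Each array is
      * compacted in place and its element count updated.
      */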
4613 static void kvm_init_msr_list(void)
4614 {
4615         u32 dummy[2];
4616         unsigned i, j;
4617 
4618         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4619                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
4620                         continue;
4621 
4622                 /*
4623                  * Even MSRs that are valid in the host may not be exposed
4624                  * to the guests in some cases.
4625                  */
4626                 switch (msrs_to_save[i]) {
4627                 case MSR_IA32_BNDCFGS:
4628                         if (!kvm_mpx_supported())
4629                                 continue;
4630                         break;
4631                 case MSR_TSC_AUX:
4632                         if (!kvm_x86_ops->rdtscp_supported())
4633                                 continue;
4634                         break;
4635                 default:
4636                         break;
4637                 }
4638 
4639                 if (j < i)
4640                         msrs_to_save[j] = msrs_to_save[i];
4641                 j++;
4642         }
4643         num_msrs_to_save = j;
4644 
4645         for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4646                 if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
4647                         continue;
4648 
4649                 if (j < i)
4650                         emulated_msrs[j] = emulated_msrs[i];
4651                 j++;
4652         }
4653         num_emulated_msrs = j;
4654 
4655         for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
4656                 struct kvm_msr_entry msr;
4657 
4658                 msr.index = msr_based_features[i];
4659                 if (kvm_get_msr_feature(&msr))
4660                         continue;
4661 
4662                 if (j < i)
4663                         msr_based_features[j] = msr_based_features[i];
4664                 j++;
4665         }
4666         num_msr_based_features = j;
4667 }
4668 
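     /*
      * vcpu_mmio_write() and vcpu_mmio_read() below process an MMIO access
      * in chunks of at most 8 bytes, offering each chunk first to the
      * in-kernel local APIC and then to the devices on KVM_MMIO_BUS.  They
      * return the number of bytes handled in the kernel; the caller hands
      * the remainder to userspace.
      */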
4669 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
4670                            const void *v)
4671 {
4672         int handled = 0;
4673         int n;
4674 
4675         do {
4676                 n = min(len, 8);
4677                 if (!(lapic_in_kernel(vcpu) &&
4678                       !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
4679                     && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
4680                         break;
4681                 handled += n;
4682                 addr += n;
4683                 len -= n;
4684                 v += n;
4685         } while (len);
4686 
4687         return handled;
4688 }
4689 
4690 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
4691 {
4692         int handled = 0;
4693         int n;
4694 
4695         do {
4696                 n = min(len, 8);
4697                 if (!(lapic_in_kernel(vcpu) &&
4698                       !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
4699                                          addr, n, v))
4700                     && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
4701                         break;
4702                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
4703                 handled += n;
4704                 addr += n;
4705                 len -= n;
4706                 v += n;
4707         } while (len);
4708 
4709         return handled;
4710 }
4711 
4712 static void kvm_set_segment(struct kvm_vcpu *vcpu,
4713                         struct kvm_segment *var, int seg)
4714 {
4715         kvm_x86_ops->set_segment(vcpu, var, seg);
4716 }
4717 
4718 void kvm_get_segment(struct kvm_vcpu *vcpu,
4719                      struct kvm_segment *var, int seg)
4720 {
4721         kvm_x86_ops->get_segment(vcpu, var, seg);
4722 }
4723 
4724 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
4725                            struct x86_exception *exception)
4726 {
4727         gpa_t t_gpa;
4728 
4729         BUG_ON(!mmu_is_nested(vcpu));
4730 
4731         /* NPT walks are always user-walks */
4732         access |= PFERR_USER_MASK;
4733         t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
4734 
4735         return t_gpa;
4736 }
4737 
4738 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
4739                               struct x86_exception *exception)
4740 {
4741         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4742         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4743 }
4744 
4745 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
4746                                struct x86_exception *exception)
4747 {
4748         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4749         access |= PFERR_FETCH_MASK;
4750         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4751 }
4752 
4753 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
4754                                struct x86_exception *exception)
4755 {
4756         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4757         access |= PFERR_WRITE_MASK;
4758         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4759 }
4760 
4761 /* used to access any guest's mapped memory without checking CPL */
4762 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
4763                                 struct x86_exception *exception)
4764 {
4765         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
4766 }
4767 
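     /*
      * Copy 'bytes' of guest-virtual memory into 'val', one page at a time:
      * each page is translated through the walk MMU with the given access
      * rights and then read with kvm_vcpu_read_guest_page().  Returns
      * X86EMUL_PROPAGATE_FAULT on a translation fault and X86EMUL_IO_NEEDED
      * on a failed copy.
      */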
4768 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4769                                       struct kvm_vcpu *vcpu, u32 access,
4770                                       struct x86_exception *exception)
4771 {
4772         void *data = val;
4773         int r = X86EMUL_CONTINUE;
4774 
4775         while (bytes) {
4776                 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
4777                                                             exception);
4778                 unsigned offset = addr & (PAGE_SIZE-1);
4779                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
4780                 int ret;
4781 
4782                 if (gpa == UNMAPPED_GVA)
4783                         return X86EMUL_PROPAGATE_FAULT;
4784                 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
4785                                                offset, toread);
4786                 if (ret < 0) {
4787                         r = X86EMUL_IO_NEEDED;
4788                         goto out;
4789                 }
4790 
4791                 bytes -= toread;
4792                 data += toread;
4793                 addr += toread;
4794         }
4795 out:
4796         return r;
4797 }
4798 
4799 /* used for instruction fetching */
4800 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
4801                                 gva_t addr, void *val, unsigned int bytes,
4802                                 struct x86_exception *exception)
4803 {
4804         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4805         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4806         unsigned offset;
4807         int ret;
4808 
4809         /* Inline kvm_read_guest_virt_helper for speed.  */
4810         gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
4811                                                     exception);
4812         if (unlikely(gpa == UNMAPPED_GVA))
4813                 return X86EMUL_PROPAGATE_FAULT;
4814 
4815         offset = addr & (PAGE_SIZE-1);
4816         if (WARN_ON(offset + bytes > PAGE_SIZE))
4817                 bytes = (unsigned)PAGE_SIZE - offset;
4818         ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
4819                                        offset, bytes);
4820         if (unlikely(ret < 0))
4821                 return X86EMUL_IO_NEEDED;
4822 
4823         return X86EMUL_CONTINUE;
4824 }
4825 
4826 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
4827                                gva_t addr, void *val, unsigned int bytes,
4828                                struct x86_exception *exception)
4829 {
4830         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4831 
4832         return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
4833                                           exception);
4834 }
4835 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
4836 
4837 static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
4838                              gva_t addr, void *val, unsigned int bytes,
4839                              struct x86_exception *exception, bool system)
4840 {
4841         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4842         u32 access = 0;
4843 
4844         if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
4845                 access |= PFERR_USER_MASK;
4846 
4847         return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
4848 }
4849 
4850 static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
4851                 unsigned long addr, void *val, unsigned int bytes)
4852 {
4853         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4854         int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
4855 
4856         return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
4857 }
4858 
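     /*
      * Write-side counterpart of kvm_read_guest_virt_helper(): translate
      * each page through the walk MMU and copy the data out with
      * kvm_vcpu_write_guest(), propagating faults and I/O errors the same
      * way.
      */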
4859 static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4860                                       struct kvm_vcpu *vcpu, u32 access,
4861                                       struct x86_exception *exception)
4862 {
4863         void *data = val;
4864         int r = X86EMUL_CONTINUE;
4865 
4866         while (bytes) {
4867                 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
4868                                                             access,
4869                                                             exception);
4870                 unsigned offset = addr & (PAGE_SIZE-1);
4871                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
4872                 int ret;
4873 
4874                 if (gpa == UNMAPPED_GVA)
4875                         return X86EMUL_PROPAGATE_FAULT;
4876                 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
4877                 if (ret < 0) {
4878                         r = X86EMUL_IO_NEEDED;
4879                         goto out;
4880                 }
4881 
4882                 bytes -= towrite;
4883                 data += towrite;
4884                 addr += towrite;
4885         }
4886 out:
4887         return r;
4888 }
4889 
4890 static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
4891                               unsigned int bytes, struct x86_exception *exception,
4892                               bool system)
4893 {
4894         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4895         u32 access = PFERR_WRITE_MASK;
4896 
4897         if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
4898                 access |= PFERR_USER_MASK;
4899 
4900         return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
4901                                            access, exception);
4902 }
4903 
4904 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
4905                                 unsigned int bytes, struct x86_exception *exception)
4906 {
4907         /* kvm_write_guest_virt_system can pull in tons of pages. */
4908         vcpu->arch.l1tf_flush_l1d = true;
4909 
4910         return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
4911                                            PFERR_WRITE_MASK, exception);
4912 }
4913 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4914 
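     /*
      * #UD intercept handler.  If force_emulation_prefix is set and the
      * faulting instruction is ud2 followed by the ASCII bytes "kvm", skip
      * the signature and fully emulate the instruction that follows;
      * otherwise emulate with EMULTYPE_TRAP_UD and queue a #UD if
      * emulation fails.
      */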
4915 int handle_ud(struct kvm_vcpu *vcpu)
4916 {
4917         int emul_type = EMULTYPE_TRAP_UD;
4918         enum emulation_result er;
4919         char sig[5]; /* ud2; .ascii "kvm" */
4920         struct x86_exception e;
4921 
4922         if (force_emulation_prefix &&
4923             kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
4924                                 sig, sizeof(sig), &e) == 0 &&
4925             memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
4926                 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
4927                 emul_type = 0;
4928         }
4929 
4930         er = emulate_instruction(vcpu, emul_type);
4931         if (er == EMULATE_USER_EXIT)
4932                 return 0;
4933         if (er != EMULATE_DONE)
4934                 kvm_queue_exception(vcpu, UD_VECTOR);
4935         return 1;
4936 }
4937 EXPORT_SYMBOL_GPL(handle_ud);
4938 
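     /*
      * Returns 1 if the GPA has to be handled as MMIO (the APIC-access
      * page or a hit in the vCPU's cached MMIO info), 0 if it is ordinary
      * guest memory.
      */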
4939 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4940                             gpa_t gpa, bool write)
4941 {
4942         /* For APIC access vmexit */
4943         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4944                 return 1;
4945 
4946         if (vcpu_match_mmio_gpa(vcpu, gpa)) {
4947                 trace_vcpu_match_mmio(gva, gpa, write, true);
4948                 return 1;
4949         }
4950 
4951         return 0;
4952 }
4953 
4954 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4955                                 gpa_t *gpa, struct x86_exception *exception,
4956                                 bool write)
4957 {
4958         u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
4959                 | (write ? PFERR_WRITE_MASK : 0);
4960 
4961         /*
4962          * Currently PKRU is only applied to EPT-enabled guests, so there
4963          * is no pkey in the EPT page table for an L1 guest or in the EPT
4964          * shadow page table for an L2 guest.
4965          */
4966         if (vcpu_match_mmio_gva(vcpu, gva)
4967             && !permission_fault(vcpu, vcpu->arch.walk_mmu,
4968                                  vcpu->arch.access, 0, access)) {
4969                 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4970                                         (gva & (PAGE_SIZE - 1));
4971                 trace_vcpu_match_mmio(gva, *gpa, write, false);
4972                 return 1;
4973         }
4974 
4975         *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4976 
4977         if (*gpa == UNMAPPED_GVA)
4978                 return -1;
4979 
4980         return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
4981 }
4982 
4983 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4984                         const void *val, int bytes)
4985 {
4986         int ret;
4987 
4988         ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
4989         if (ret < 0)
4990                 return 0;
4991         kvm_page_track_write(vcpu, gpa, val, bytes);
4992         return 1;
4993 }
4994 
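     /*
      * read_write_emulator_ops lets emulator_read_write() share a single
      * code path for reads and writes: read_emultor and write_emultor
      * below supply the direction-specific handlers for ordinary guest
      * memory, in-kernel MMIO and the exit-to-userspace case.
      */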
4995 struct read_write_emulator_ops {
4996         int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4997                                   int bytes);
4998         int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4999                                   void *val, int bytes);
5000         int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5001                                int bytes, void *val);
5002         int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5003                                     void *val, int bytes);
5004         bool write;
5005 };
5006 
5007 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
5008 {
5009         if (vcpu->mmio_read_completed) {
5010                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
5011                                vcpu->mmio_fragments[0].gpa, val);
5012                 vcpu->mmio_read_completed = 0;
5013                 return 1;
5014         }
5015 
5016         return 0;
5017 }
5018 
5019 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5020                         void *val, int bytes)
5021 {
5022         return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
5023 }
5024 
5025 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5026                          void *val, int bytes)
5027 {
5028         return emulator_write_phys(vcpu, gpa, val, bytes);
5029 }
5030 
5031 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
5032 {
5033         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
5034         return vcpu_mmio_write(vcpu, gpa, bytes, val);
5035 }
5036 
5037 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5038                           void *val, int bytes)
5039 {
5040         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
5041         return X86EMUL_IO_NEEDED;
5042 }
5043 
5044 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5045                            void *val, int bytes)
5046 {
5047         struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
5048 
5049         memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
5050         return X86EMUL_CONTINUE;
5051 }
5052 
5053 static const struct read_write_emulator_ops read_emultor = {
5054         .read_write_prepare = read_prepare,
5055         .read_write_emulate = read_emulate,
5056         .read_write_mmio = vcpu_mmio_read,
5057         .read_write_exit_mmio = read_exit_mmio,
5058 };
5059 
5060 static const struct read_write_emulator_ops write_emultor = {
5061         .read_write_emulate = write_emulate,
5062         .read_write_mmio = write_mmio,
5063         .read_write_exit_mmio = write_exit_mmio,
5064         .write = true,
5065 };
5066 
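     /*
      * Emulate a read or write that is confined to a single guest page:
      * translate the address (reusing a GPA already provided by the
      * page-fault exit when possible), try ordinary guest memory, then
      * in-kernel MMIO devices, and queue whatever is left as an
      * mmio_fragment for userspace.
      */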
5067 static int emulator_read_write_onepage(unsigned long addr, void *val,
5068                                        unsigned int bytes,
5069                                        struct x86_exception *exception,
5070                                        struct kvm_vcpu *vcpu,
5071                                        const struct read_write_emulator_ops *ops)
5072 {
5073         gpa_t gpa;
5074         int handled, ret;
5075         bool write = ops->write;
5076         struct kvm_mmio_fragment *frag;
5077         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5078 
5079         /*
5080          * If the exit was due to an NPF we may already have a GPA.
5081          * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
5082          * Note, this cannot be used on string operations, since a string
5083          * operation using rep only has the initial GPA from the NPF that
5084          * occurred.
5085          */
5086         if (vcpu->arch.gpa_available &&
5087             emulator_can_use_gpa(ctxt) &&
5088             (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5089                 gpa = vcpu->arch.gpa_val;
5090                 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
5091         } else {
5092                 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
5093                 if (ret < 0)
5094                         return X86EMUL_PROPAGATE_FAULT;
5095         }
5096 
5097         if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
5098                 return X86EMUL_CONTINUE;
5099 
5100         /*
5101          * Is this MMIO handled locally?
5102          */
5103         handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
5104         if (handled == bytes)
5105                 return X86EMUL_CONTINUE;
5106 
5107         gpa += handled;
5108         bytes -= handled;
5109         val += handled;
5110 
5111         WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
5112         frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
5113         frag->gpa = gpa;
5114         frag->data = val;
5115         frag->len = bytes;
5116         return X86EMUL_CONTINUE;
5117 }
5118 
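     /*
      * Top-level emulated memory access: split an access that crosses a
      * page boundary into two single-page pieces and, if any MMIO
      * fragments were queued, fill in vcpu->run for a KVM_EXIT_MMIO exit
      * to userspace.
      */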
5119 static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
5120                         unsigned long addr,
5121                         void *val, unsigned int bytes,
5122                         struct x86_exception *exception,
5123                         const struct read_write_emulator_ops *ops)
5124 {
5125         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5126         gpa_t gpa;
5127         int rc;
5128 
5129         if (ops->read_write_prepare &&
5130                   ops->read_write_prepare(vcpu, val, bytes))
5131                 return X86EMUL_CONTINUE;
5132 
5133         vcpu->mmio_nr_fragments = 0;
5134 
5135         /* Crossing a page boundary? */
5136         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
5137                 int now;
5138 
5139                 now = -addr & ~PAGE_MASK;
5140                 rc = emulator_read_write_onepage(addr, val, now, exception,
5141                                                  vcpu, ops);
5142 
5143                 if (rc != X86EMUL_CONTINUE)
5144                         return rc;
5145                 addr += now;
5146                 if (ctxt->mode != X86EMUL_MODE_PROT64)
5147                         addr = (u32)addr;
5148                 val += now;
5149                 bytes -= now;
5150         }
5151 
5152         rc = emulator_read_write_onepage(addr, val, bytes, exception,
5153                                          vcpu, ops);
5154         if (rc != X86EMUL_CONTINUE)
5155                 return rc;
5156 
5157         if (!vcpu->mmio_nr_fragments)
5158                 return rc;
5159 
5160         gpa = vcpu->mmio_fragments[0].gpa;
5161 
5162         vcpu->mmio_needed = 1;
5163         vcpu->mmio_cur_fragment = 0;
5164 
5165         vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
5166         vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
5167         vcpu->run->exit_reason = KVM_EXIT_MMIO;
5168         vcpu->run->mmio.phys_addr = gpa;
5169 
5170         return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
5171 }
5172 
5173 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
5174                                   unsigned long addr,
5175                                   void *val,
5176                                   unsigned int bytes,
5177                                   struct x86_exception *exception)
5178 {
5179         return emulator_read_write(ctxt, addr, val, bytes,
5180                                    exception, &read_emultor);
5181 }
5182 
5183 static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
5184                             unsigned long addr,
5185                             const void *val,
5186                             unsigned int bytes,
5187                             struct x86_exception *exception)
5188 {
5189         return emulator_read_write(ctxt, addr, (void *)val, bytes,
5190                                    exception, &write_emultor);
5191 }
5192 
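     /*
      * Atomically compare *ptr against *old and install *new, treating the
      * operands as type t; evaluates to true if the exchange took place.
      */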
5193 #define CMPXCHG_TYPE(t, ptr, old, new) \
5194         (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
5195 
5196 #ifdef CONFIG_X86_64
5197 #  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
5198 #else
5199 #  define CMPXCHG64(ptr, old, new) \
5200         (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
5201 #endif
5202 
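     /*
      * Emulate cmpxchg by mapping the guest page and performing a real
      * cmpxchg on the host-side mapping.  Falls back to a plain emulated
      * write (with a one-time warning) when the operand size is
      * unsupported, the address is unmapped or MMIO, or the operand
      * crosses a page boundary.
      */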
5203 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5204                                      unsigned long addr,
5205                                      const void *old,
5206                                      const void *new,
5207                                      unsigned int bytes,
5208                                      struct x86_exception *exception)
5209 {
5210         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5211         gpa_t gpa;
5212         struct page *page;
5213         char *kaddr;
5214         bool exchanged;
5215 
5216         /* guest cmpxchg8b accesses have to be emulated atomically */
5217         if (bytes > 8 || (bytes & (bytes - 1)))
5218                 goto emul_write;
5219 
5220         gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
5221 
5222         if (gpa == UNMAPPED_GVA ||
5223             (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
5224                 goto emul_write;
5225 
5226         if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
5227                 goto emul_write;
5228 
5229         page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
5230         if (is_error_page(page))
5231                 goto emul_write;
5232 
5233         kaddr = kmap_atomic(page);
5234         kaddr += offset_in_page(gpa);
5235         switch (bytes) {
5236         case 1:
5237                 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
5238                 break;
5239         case 2:
5240                 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
5241                 break;
5242         case 4:
5243                 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
5244                 break;
5245         case 8:
5246                 exchanged = CMPXCHG64(kaddr, old, new);
5247                 break;
5248         default:
5249                 BUG();
5250         }
5251         kunmap_atomic(kaddr);
5252         kvm_release_page_dirty(page);
5253 
5254         if (!exchanged)
5255                 return X86EMUL_CMPXCHG_FAILED;
5256 
5257         kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5258         kvm_page_track_write(vcpu, gpa, new, bytes);
5259 
5260         return X86EMUL_CONTINUE;
5261 
5262 emul_write:
5263         printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
5264 
5265         return emulator_write_emulated(ctxt, addr, new, bytes, exception);
5266 }
5267 
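     /*
      * Forward each element of the pending PIO operation to the in-kernel
      * KVM_PIO_BUS, stopping at the first element no registered device
      * claims; returns 0 only if everything was handled in the kernel.
      */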
5268 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
5269 {
5270         int r = 0, i;
5271 
5272         for (i = 0; i < vcpu->arch.pio.count; i++) {
5273                 if (vcpu->arch.pio.in)
5274                         r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
5275                                             vcpu->arch.pio.size, pd);
5276                 else
5277                         r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
5278                                              vcpu->arch.pio.port, vcpu->arch.pio.size,
5279                                              pd);
5280                 if (r)
5281                         break;
5282                 pd += vcpu->arch.pio.size;
5283         }
5284         return r;
5285 }
5286 
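     /*
      * Common helper for emulated IN/OUT: record port, size, count and
      * direction in vcpu->arch.pio and first try to satisfy the access
      * with in-kernel devices, returning 1 when that fully succeeds.
      */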
5287 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
5288                                unsigned short port, void *val,
5289                                unsigned int count, bool in)
5290 {
5291         vcpu->arch.pio.port = port;
5292         vcpu->arch.pio.in = in;
5293         vcpu->arch.pio.count = count;
5294         vcpu->arch.pio.size = size;
5295 
5296         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
5297                 vcpu->arch.pio.count = 0;
5298                 return 1;
5299         }