
TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/x86.c


  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * Kernel-based Virtual Machine driver for Linux
  4  *
  5  * derived from drivers/kvm/kvm_main.c
  6  *
  7  * Copyright (C) 2006 Qumranet, Inc.
  8  * Copyright (C) 2008 Qumranet, Inc.
  9  * Copyright IBM Corporation, 2008
 10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 11  *
 12  * Authors:
 13  *   Avi Kivity   <avi@qumranet.com>
 14  *   Yaniv Kamay  <yaniv@qumranet.com>
 15  *   Amit Shah    <amit.shah@qumranet.com>
 16  *   Ben-Ami Yassour <benami@il.ibm.com>
 17  */
 18 
 19 #include <linux/kvm_host.h>
 20 #include "irq.h"
 21 #include "mmu.h"
 22 #include "i8254.h"
 23 #include "tss.h"
 24 #include "kvm_cache_regs.h"
 25 #include "x86.h"
 26 #include "cpuid.h"
 27 #include "pmu.h"
 28 #include "hyperv.h"
 29 
 30 #include <linux/clocksource.h>
 31 #include <linux/interrupt.h>
 32 #include <linux/kvm.h>
 33 #include <linux/fs.h>
 34 #include <linux/vmalloc.h>
 35 #include <linux/export.h>
 36 #include <linux/moduleparam.h>
 37 #include <linux/mman.h>
 38 #include <linux/highmem.h>
 39 #include <linux/iommu.h>
 40 #include <linux/intel-iommu.h>
 41 #include <linux/cpufreq.h>
 42 #include <linux/user-return-notifier.h>
 43 #include <linux/srcu.h>
 44 #include <linux/slab.h>
 45 #include <linux/perf_event.h>
 46 #include <linux/uaccess.h>
 47 #include <linux/hash.h>
 48 #include <linux/pci.h>
 49 #include <linux/timekeeper_internal.h>
 50 #include <linux/pvclock_gtod.h>
 51 #include <linux/kvm_irqfd.h>
 52 #include <linux/irqbypass.h>
 53 #include <linux/sched/stat.h>
 54 #include <linux/mem_encrypt.h>
 55 
 56 #include <trace/events/kvm.h>
 57 
 58 #include <asm/debugreg.h>
 59 #include <asm/msr.h>
 60 #include <asm/desc.h>
 61 #include <asm/mce.h>
 62 #include <linux/kernel_stat.h>
 63 #include <asm/fpu/internal.h> /* Ugh! */
 64 #include <asm/pvclock.h>
 65 #include <asm/div64.h>
 66 #include <asm/irq_remapping.h>
 67 #include <asm/mshyperv.h>
 68 #include <asm/hypervisor.h>
 69 #include <asm/intel_pt.h>
 70 
 71 #define CREATE_TRACE_POINTS
 72 #include "trace.h"
 73 
 74 #define MAX_IO_MSRS 256
 75 #define KVM_MAX_MCE_BANKS 32
 76 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 77 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 78 
 79 #define emul_to_vcpu(ctxt) \
 80         container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
 81 
 82 /* EFER defaults:
 83  * - enable SYSCALL by default because it is emulated by KVM
 84  * - enable LME and LMA by default on 64-bit KVM
 85  */
 86 #ifdef CONFIG_X86_64
 87 static
 88 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 89 #else
 90 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 91 #endif
 92 
 93 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 94 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 95 
 96 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 97                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 98 
 99 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
100 static void process_nmi(struct kvm_vcpu *vcpu);
101 static void enter_smm(struct kvm_vcpu *vcpu);
102 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
103 static void store_regs(struct kvm_vcpu *vcpu);
104 static int sync_regs(struct kvm_vcpu *vcpu);
105 
106 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
107 EXPORT_SYMBOL_GPL(kvm_x86_ops);
108 
109 static bool __read_mostly ignore_msrs = false;
110 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
111 
112 static bool __read_mostly report_ignored_msrs = true;
113 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
114 
115 unsigned int min_timer_period_us = 200;
116 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
117 
118 static bool __read_mostly kvmclock_periodic_sync = true;
119 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
120 
121 bool __read_mostly kvm_has_tsc_control;
122 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
123 u32  __read_mostly kvm_max_guest_tsc_khz;
124 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
125 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
126 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
127 u64  __read_mostly kvm_max_tsc_scaling_ratio;
128 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
129 u64 __read_mostly kvm_default_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
131 
132 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
133 static u32 __read_mostly tsc_tolerance_ppm = 250;
134 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
135 
136 /*
137  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
138  * adaptive tuning starting from a default advancement of 1000ns.  '0' disables
139  * advancement entirely.  Any other value is used as-is and disables adaptive
140  * tuning, i.e. allows privileged userspace to set an exact advancement time.
141  */
142 static int __read_mostly lapic_timer_advance_ns = -1;
143 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
144 
145 static bool __read_mostly vector_hashing = true;
146 module_param(vector_hashing, bool, S_IRUGO);
147 
148 bool __read_mostly enable_vmware_backdoor = false;
149 module_param(enable_vmware_backdoor, bool, S_IRUGO);
150 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
151 
152 static bool __read_mostly force_emulation_prefix = false;
153 module_param(force_emulation_prefix, bool, S_IRUGO);
154 
155 #define KVM_NR_SHARED_MSRS 16
156 
157 struct kvm_shared_msrs_global {
158         int nr;
159         u32 msrs[KVM_NR_SHARED_MSRS];
160 };
161 
162 struct kvm_shared_msrs {
163         struct user_return_notifier urn;
164         bool registered;
165         struct kvm_shared_msr_values {
166                 u64 host;
167                 u64 curr;
168         } values[KVM_NR_SHARED_MSRS];
169 };
170 
171 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
172 static struct kvm_shared_msrs __percpu *shared_msrs;
173 
174 struct kvm_stats_debugfs_item debugfs_entries[] = {
175         { "pf_fixed", VCPU_STAT(pf_fixed) },
176         { "pf_guest", VCPU_STAT(pf_guest) },
177         { "tlb_flush", VCPU_STAT(tlb_flush) },
178         { "invlpg", VCPU_STAT(invlpg) },
179         { "exits", VCPU_STAT(exits) },
180         { "io_exits", VCPU_STAT(io_exits) },
181         { "mmio_exits", VCPU_STAT(mmio_exits) },
182         { "signal_exits", VCPU_STAT(signal_exits) },
183         { "irq_window", VCPU_STAT(irq_window_exits) },
184         { "nmi_window", VCPU_STAT(nmi_window_exits) },
185         { "halt_exits", VCPU_STAT(halt_exits) },
186         { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
187         { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
188         { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
189         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
190         { "hypercalls", VCPU_STAT(hypercalls) },
191         { "request_irq", VCPU_STAT(request_irq_exits) },
192         { "irq_exits", VCPU_STAT(irq_exits) },
193         { "host_state_reload", VCPU_STAT(host_state_reload) },
194         { "fpu_reload", VCPU_STAT(fpu_reload) },
195         { "insn_emulation", VCPU_STAT(insn_emulation) },
196         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
197         { "irq_injections", VCPU_STAT(irq_injections) },
198         { "nmi_injections", VCPU_STAT(nmi_injections) },
199         { "req_event", VCPU_STAT(req_event) },
200         { "l1d_flush", VCPU_STAT(l1d_flush) },
201         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
202         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
203         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
204         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
205         { "mmu_flooded", VM_STAT(mmu_flooded) },
206         { "mmu_recycled", VM_STAT(mmu_recycled) },
207         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
208         { "mmu_unsync", VM_STAT(mmu_unsync) },
209         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
210         { "largepages", VM_STAT(lpages) },
211         { "max_mmu_page_hash_collisions",
212                 VM_STAT(max_mmu_page_hash_collisions) },
213         { NULL }
214 };
215 
216 u64 __read_mostly host_xcr0;
217 
218 struct kmem_cache *x86_fpu_cache;
219 EXPORT_SYMBOL_GPL(x86_fpu_cache);
220 
221 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
222 
223 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
224 {
225         int i;
226         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
227                 vcpu->arch.apf.gfns[i] = ~0;
228 }
229 
230 static void kvm_on_user_return(struct user_return_notifier *urn)
231 {
232         unsigned slot;
233         struct kvm_shared_msrs *locals
234                 = container_of(urn, struct kvm_shared_msrs, urn);
235         struct kvm_shared_msr_values *values;
236         unsigned long flags;
237 
238         /*
239          * Disabling irqs at this point since the following code could be
240          * interrupted and executed through kvm_arch_hardware_disable()
241          */
242         local_irq_save(flags);
243         if (locals->registered) {
244                 locals->registered = false;
245                 user_return_notifier_unregister(urn);
246         }
247         local_irq_restore(flags);
248         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
249                 values = &locals->values[slot];
250                 if (values->host != values->curr) {
251                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
252                         values->curr = values->host;
253                 }
254         }
255 }
256 
257 static void shared_msr_update(unsigned slot, u32 msr)
258 {
259         u64 value;
260         unsigned int cpu = smp_processor_id();
261         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
262 
263         /* shared_msrs_global is only read here, and nobody should be
264          * modifying it at this point, so no locking is needed. */
265         if (slot >= shared_msrs_global.nr) {
266                 printk(KERN_ERR "kvm: invalid MSR slot!");
267                 return;
268         }
269         rdmsrl_safe(msr, &value);
270         smsr->values[slot].host = value;
271         smsr->values[slot].curr = value;
272 }
273 
274 void kvm_define_shared_msr(unsigned slot, u32 msr)
275 {
276         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
277         shared_msrs_global.msrs[slot] = msr;
278         if (slot >= shared_msrs_global.nr)
279                 shared_msrs_global.nr = slot + 1;
280 }
281 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
282 
283 static void kvm_shared_msr_cpu_online(void)
284 {
285         unsigned i;
286 
287         for (i = 0; i < shared_msrs_global.nr; ++i)
288                 shared_msr_update(i, shared_msrs_global.msrs[i]);
289 }
290 
291 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
292 {
293         unsigned int cpu = smp_processor_id();
294         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
295         int err;
296 
297         if (((value ^ smsr->values[slot].curr) & mask) == 0)
298                 return 0;
299         smsr->values[slot].curr = value;
300         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
301         if (err)
302                 return 1;
303 
304         if (!smsr->registered) {
305                 smsr->urn.on_user_return = kvm_on_user_return;
306                 user_return_notifier_register(&smsr->urn);
307                 smsr->registered = true;
308         }
309         return 0;
310 }
311 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
312 
313 static void drop_user_return_notifiers(void)
314 {
315         unsigned int cpu = smp_processor_id();
316         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
317 
318         if (smsr->registered)
319                 kvm_on_user_return(&smsr->urn);
320 }
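
/*
 * For illustration only: a vendor module (e.g. vmx.c or svm.c) would
 * typically use the shared/user-return MSR machinery above roughly as
 * follows; the slot number and MSR choice here are arbitrary examples,
 * not taken from this file.
 *
 *	// at hardware-setup time: reserve slot 0 for MSR_STAR
 *	kvm_define_shared_msr(0, MSR_STAR);
 *
 *	// when preparing to run a guest: load the guest value; the
 *	// user-return notifier registered by kvm_set_shared_msr()
 *	// restores the host value before the CPU returns to userspace
 *	kvm_set_shared_msr(0, guest_star_value, ~0ULL);
 */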
321 
322 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
323 {
324         return vcpu->arch.apic_base;
325 }
326 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
327 
328 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
329 {
330         return kvm_apic_mode(kvm_get_apic_base(vcpu));
331 }
332 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
333 
334 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
335 {
336         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
337         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
338         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
339                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
340 
341         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
342                 return 1;
343         if (!msr_info->host_initiated) {
344                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
345                         return 1;
346                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
347                         return 1;
348         }
349 
350         kvm_lapic_set_base(vcpu, msr_info->data);
351         return 0;
352 }
353 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
354 
355 asmlinkage __visible void kvm_spurious_fault(void)
356 {
357         /* Fault while not rebooting.  We want the trace. */
358         BUG();
359 }
360 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
361 
362 #define EXCPT_BENIGN            0
363 #define EXCPT_CONTRIBUTORY      1
364 #define EXCPT_PF                2
365 
366 static int exception_class(int vector)
367 {
368         switch (vector) {
369         case PF_VECTOR:
370                 return EXCPT_PF;
371         case DE_VECTOR:
372         case TS_VECTOR:
373         case NP_VECTOR:
374         case SS_VECTOR:
375         case GP_VECTOR:
376                 return EXCPT_CONTRIBUTORY;
377         default:
378                 break;
379         }
380         return EXCPT_BENIGN;
381 }
382 
383 #define EXCPT_FAULT             0
384 #define EXCPT_TRAP              1
385 #define EXCPT_ABORT             2
386 #define EXCPT_INTERRUPT         3
387 
388 static int exception_type(int vector)
389 {
390         unsigned int mask;
391 
392         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
393                 return EXCPT_INTERRUPT;
394 
395         mask = 1 << vector;
396 
397         /* #DB is trap, as instruction watchpoints are handled elsewhere */
398         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
399                 return EXCPT_TRAP;
400 
401         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
402                 return EXCPT_ABORT;
403 
404         /* Reserved exceptions will result in fault */
405         return EXCPT_FAULT;
406 }
407 
408 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
409 {
410         unsigned nr = vcpu->arch.exception.nr;
411         bool has_payload = vcpu->arch.exception.has_payload;
412         unsigned long payload = vcpu->arch.exception.payload;
413 
414         if (!has_payload)
415                 return;
416 
417         switch (nr) {
418         case DB_VECTOR:
419                 /*
420                  * "Certain debug exceptions may clear bits 0-3.  The
421                  * remaining contents of the DR6 register are never
422                  * cleared by the processor".
423                  */
424                 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
425                 /*
426                  * DR6.RTM is set by all #DB exceptions that don't clear it.
427                  */
428                 vcpu->arch.dr6 |= DR6_RTM;
429                 vcpu->arch.dr6 |= payload;
430                 /*
431                  * Bit 16 should be set in the payload whenever the #DB
432                  * exception should clear DR6.RTM. This makes the payload
433                  * compatible with the pending debug exceptions under VMX.
434                  * Though not currently documented in the SDM, this also
435                  * makes the payload compatible with the exit qualification
436                  * for #DB exceptions under VMX.
437                  */
438                 vcpu->arch.dr6 ^= payload & DR6_RTM;
439                 break;
440         case PF_VECTOR:
441                 vcpu->arch.cr2 = payload;
442                 break;
443         }
444 
445         vcpu->arch.exception.has_payload = false;
446         vcpu->arch.exception.payload = 0;
447 }
448 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
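
/*
 * A worked example of the DR6 merge above (illustrative, using the
 * architectural bit positions DR6.BS = bit 14 and DR6.RTM = bit 16):
 * for a single-step trap whose payload is just DR6_BS,
 *
 *	dr6 &= ~DR_TRAP_BITS;		// drop stale B0-B3
 *	dr6 |= DR6_RTM;			// RTM reads as 1 by default
 *	dr6 |= payload;			// sets BS
 *	dr6 ^= payload & DR6_RTM;	// no-op here; a payload with bit 16
 *					// set would clear DR6.RTM instead
 *
 * leaving DR6.BS = 1 and DR6.RTM = 1, which is what the CPU would report
 * for a #DB taken outside an RTM region.
 */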
449 
450 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
451                 unsigned nr, bool has_error, u32 error_code,
452                 bool has_payload, unsigned long payload, bool reinject)
453 {
454         u32 prev_nr;
455         int class1, class2;
456 
457         kvm_make_request(KVM_REQ_EVENT, vcpu);
458 
459         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
460         queue:
461                 if (has_error && !is_protmode(vcpu))
462                         has_error = false;
463                 if (reinject) {
464                         /*
465                          * On vmentry, vcpu->arch.exception.pending is only
466                          * true if an event injection was blocked by
467                          * nested_run_pending.  In that case, however,
468                          * vcpu_enter_guest requests an immediate exit,
469                          * and the guest shouldn't proceed far enough to
470                          * need reinjection.
471                          */
472                         WARN_ON_ONCE(vcpu->arch.exception.pending);
473                         vcpu->arch.exception.injected = true;
474                         if (WARN_ON_ONCE(has_payload)) {
475                                 /*
476                                  * A reinjected event has already
477                                  * delivered its payload.
478                                  */
479                                 has_payload = false;
480                                 payload = 0;
481                         }
482                 } else {
483                         vcpu->arch.exception.pending = true;
484                         vcpu->arch.exception.injected = false;
485                 }
486                 vcpu->arch.exception.has_error_code = has_error;
487                 vcpu->arch.exception.nr = nr;
488                 vcpu->arch.exception.error_code = error_code;
489                 vcpu->arch.exception.has_payload = has_payload;
490                 vcpu->arch.exception.payload = payload;
491                 /*
492                  * In guest mode, payload delivery should be deferred,
493                  * so that the L1 hypervisor can intercept #PF before
494                  * CR2 is modified (or intercept #DB before DR6 is
495                  * modified under nVMX).  However, for ABI
496                  * compatibility with KVM_GET_VCPU_EVENTS and
497                  * KVM_SET_VCPU_EVENTS, we can't delay payload
498                  * delivery unless userspace has enabled this
499                  * functionality via the per-VM capability,
500                  * KVM_CAP_EXCEPTION_PAYLOAD.
501                  */
502                 if (!vcpu->kvm->arch.exception_payload_enabled ||
503                     !is_guest_mode(vcpu))
504                         kvm_deliver_exception_payload(vcpu);
505                 return;
506         }
507 
508         /* Check against the previously queued exception. */
509         prev_nr = vcpu->arch.exception.nr;
510         if (prev_nr == DF_VECTOR) {
511                 /* triple fault -> shutdown */
512                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
513                 return;
514         }
515         class1 = exception_class(prev_nr);
516         class2 = exception_class(nr);
517         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
518                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
519                 /*
520                  * Generate double fault per SDM Table 5-5.  Set
521                  * exception.pending = true so that the double fault
522                  * can trigger a nested vmexit.
523                  */
524                 vcpu->arch.exception.pending = true;
525                 vcpu->arch.exception.injected = false;
526                 vcpu->arch.exception.has_error_code = true;
527                 vcpu->arch.exception.nr = DF_VECTOR;
528                 vcpu->arch.exception.error_code = 0;
529                 vcpu->arch.exception.has_payload = false;
530                 vcpu->arch.exception.payload = 0;
531         } else
532                 /* replace the previous exception with the new one in the
533                    hope that instruction re-execution will regenerate the
534                    lost exception */
535                 goto queue;
536 }
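
/*
 * A worked example of the class merging above: if a #PF is already
 * pending (class EXCPT_PF) and the new exception is #GP
 * (EXCPT_CONTRIBUTORY), the second condition matches and a #DF with
 * error code 0 replaces both, per SDM Table 5-5.  A benign exception
 * such as #DB arriving on top of a pending #PF instead takes the
 * "goto queue" path and simply replaces the pending exception.  As
 * noted above, userspace opts in to deferred payload delivery with
 * KVM_ENABLE_CAP on the VM file descriptor, e.g.:
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_EXCEPTION_PAYLOAD };
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */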
537 
538 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
539 {
540         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
541 }
542 EXPORT_SYMBOL_GPL(kvm_queue_exception);
543 
544 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
545 {
546         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
547 }
548 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
549 
550 static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
551                                   unsigned long payload)
552 {
553         kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
554 }
555 
556 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
557                                     u32 error_code, unsigned long payload)
558 {
559         kvm_multiple_exception(vcpu, nr, true, error_code,
560                                true, payload, false);
561 }
562 
563 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
564 {
565         if (err)
566                 kvm_inject_gp(vcpu, 0);
567         else
568                 return kvm_skip_emulated_instruction(vcpu);
569 
570         return 1;
571 }
572 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
573 
574 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
575 {
576         ++vcpu->stat.pf_guest;
577         vcpu->arch.exception.nested_apf =
578                 is_guest_mode(vcpu) && fault->async_page_fault;
579         if (vcpu->arch.exception.nested_apf) {
580                 vcpu->arch.apf.nested_apf_token = fault->address;
581                 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
582         } else {
583                 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
584                                         fault->address);
585         }
586 }
587 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
588 
589 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
590 {
591         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
592                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
593         else
594                 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
595 
596         return fault->nested_page_fault;
597 }
598 
599 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
600 {
601         atomic_inc(&vcpu->arch.nmi_queued);
602         kvm_make_request(KVM_REQ_NMI, vcpu);
603 }
604 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
605 
606 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
607 {
608         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
609 }
610 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
611 
612 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
613 {
614         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
615 }
616 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
617 
618 /*
619  * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
620  * a #GP and return false.
621  */
622 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
623 {
624         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
625                 return true;
626         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
627         return false;
628 }
629 EXPORT_SYMBOL_GPL(kvm_require_cpl);
630 
631 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
632 {
633         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
634                 return true;
635 
636         kvm_queue_exception(vcpu, UD_VECTOR);
637         return false;
638 }
639 EXPORT_SYMBOL_GPL(kvm_require_dr);
640 
641 /*
642  * This function is used to read from the physical memory of the currently
643  * running guest. Unlike kvm_vcpu_read_guest_page, it can read either from the
644  * guest's physical memory or from its nested guest's physical memory.
645  */
646 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
647                             gfn_t ngfn, void *data, int offset, int len,
648                             u32 access)
649 {
650         struct x86_exception exception;
651         gfn_t real_gfn;
652         gpa_t ngpa;
653 
654         ngpa     = gfn_to_gpa(ngfn);
655         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
656         if (real_gfn == UNMAPPED_GVA)
657                 return -EFAULT;
658 
659         real_gfn = gpa_to_gfn(real_gfn);
660 
661         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
662 }
663 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
664 
665 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
666                                void *data, int offset, int len, u32 access)
667 {
668         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
669                                        data, offset, len, access);
670 }
671 
672 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
673 {
674         return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
675                rsvd_bits(1, 2);
676 }
677 
678 /*
679  * Load the PAE PDPTRs.  Return 1 if they are all valid, 0 otherwise.
680  */
681 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
682 {
683         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
684         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
685         int i;
686         int ret;
687         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
688 
689         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
690                                       offset * sizeof(u64), sizeof(pdpte),
691                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
692         if (ret < 0) {
693                 ret = 0;
694                 goto out;
695         }
696         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
697                 if ((pdpte[i] & PT_PRESENT_MASK) &&
698                     (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
699                         ret = 0;
700                         goto out;
701                 }
702         }
703         ret = 1;
704 
705         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
706         __set_bit(VCPU_EXREG_PDPTR,
707                   (unsigned long *)&vcpu->arch.regs_avail);
708         __set_bit(VCPU_EXREG_PDPTR,
709                   (unsigned long *)&vcpu->arch.regs_dirty);
710 out:
711 
712         return ret;
713 }
714 EXPORT_SYMBOL_GPL(load_pdptrs);
715 
716 bool pdptrs_changed(struct kvm_vcpu *vcpu)
717 {
718         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
719         bool changed = true;
720         int offset;
721         gfn_t gfn;
722         int r;
723 
724         if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
725                 return false;
726 
727         if (!test_bit(VCPU_EXREG_PDPTR,
728                       (unsigned long *)&vcpu->arch.regs_avail))
729                 return true;
730 
731         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
732         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
733         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
734                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
735         if (r < 0)
736                 goto out;
737         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
738 out:
739 
740         return changed;
741 }
742 EXPORT_SYMBOL_GPL(pdptrs_changed);
743 
744 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
745 {
746         unsigned long old_cr0 = kvm_read_cr0(vcpu);
747         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
748 
749         cr0 |= X86_CR0_ET;
750 
751 #ifdef CONFIG_X86_64
752         if (cr0 & 0xffffffff00000000UL)
753                 return 1;
754 #endif
755 
756         cr0 &= ~CR0_RESERVED_BITS;
757 
758         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
759                 return 1;
760 
761         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
762                 return 1;
763 
764         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
765 #ifdef CONFIG_X86_64
766                 if ((vcpu->arch.efer & EFER_LME)) {
767                         int cs_db, cs_l;
768 
769                         if (!is_pae(vcpu))
770                                 return 1;
771                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
772                         if (cs_l)
773                                 return 1;
774                 } else
775 #endif
776                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
777                                                  kvm_read_cr3(vcpu)))
778                         return 1;
779         }
780 
781         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
782                 return 1;
783 
784         kvm_x86_ops->set_cr0(vcpu, cr0);
785 
786         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
787                 kvm_clear_async_pf_completion_queue(vcpu);
788                 kvm_async_pf_hash_reset(vcpu);
789         }
790 
791         if ((cr0 ^ old_cr0) & update_bits)
792                 kvm_mmu_reset_context(vcpu);
793 
794         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
795             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
796             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
797                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
798 
799         return 0;
800 }
801 EXPORT_SYMBOL_GPL(kvm_set_cr0);
802 
803 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
804 {
805         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
806 }
807 EXPORT_SYMBOL_GPL(kvm_lmsw);
808 
809 void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
810 {
811         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
812                         !vcpu->guest_xcr0_loaded) {
813                 /* kvm_set_xcr() also depends on this */
814                 if (vcpu->arch.xcr0 != host_xcr0)
815                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
816                 vcpu->guest_xcr0_loaded = 1;
817         }
818 }
819 EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
820 
821 void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
822 {
823         if (vcpu->guest_xcr0_loaded) {
824                 if (vcpu->arch.xcr0 != host_xcr0)
825                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
826                 vcpu->guest_xcr0_loaded = 0;
827         }
828 }
829 EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
830 
831 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
832 {
833         u64 xcr0 = xcr;
834         u64 old_xcr0 = vcpu->arch.xcr0;
835         u64 valid_bits;
836 
837         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
838         if (index != XCR_XFEATURE_ENABLED_MASK)
839                 return 1;
840         if (!(xcr0 & XFEATURE_MASK_FP))
841                 return 1;
842         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
843                 return 1;
844 
845         /*
846          * Do not allow the guest to set bits that we do not support
847          * saving.  However, xcr0 bit 0 is always set, even if the
848          * emulated CPU does not support XSAVE (see fx_init).
849          */
850         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
851         if (xcr0 & ~valid_bits)
852                 return 1;
853 
854         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
855             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
856                 return 1;
857 
858         if (xcr0 & XFEATURE_MASK_AVX512) {
859                 if (!(xcr0 & XFEATURE_MASK_YMM))
860                         return 1;
861                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
862                         return 1;
863         }
864         vcpu->arch.xcr0 = xcr0;
865 
866         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
867                 kvm_update_cpuid(vcpu);
868         return 0;
869 }
870 
871 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
872 {
873         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
874             __kvm_set_xcr(vcpu, index, xcr)) {
875                 kvm_inject_gp(vcpu, 0);
876                 return 1;
877         }
878         return 0;
879 }
880 EXPORT_SYMBOL_GPL(kvm_set_xcr);
881 
882 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
883 {
884         unsigned long old_cr4 = kvm_read_cr4(vcpu);
885         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
886                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
887 
888         if (cr4 & CR4_RESERVED_BITS)
889                 return 1;
890 
891         if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
892                 return 1;
893 
894         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
895                 return 1;
896 
897         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
898                 return 1;
899 
900         if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
901                 return 1;
902 
903         if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
904                 return 1;
905 
906         if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
907                 return 1;
908 
909         if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
910                 return 1;
911 
912         if (is_long_mode(vcpu)) {
913                 if (!(cr4 & X86_CR4_PAE))
914                         return 1;
915         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
916                    && ((cr4 ^ old_cr4) & pdptr_bits)
917                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
918                                    kvm_read_cr3(vcpu)))
919                 return 1;
920 
921         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
922                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
923                         return 1;
924 
925                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
926                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
927                         return 1;
928         }
929 
930         if (kvm_x86_ops->set_cr4(vcpu, cr4))
931                 return 1;
932 
933         if (((cr4 ^ old_cr4) & pdptr_bits) ||
934             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
935                 kvm_mmu_reset_context(vcpu);
936 
937         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
938                 kvm_update_cpuid(vcpu);
939 
940         return 0;
941 }
942 EXPORT_SYMBOL_GPL(kvm_set_cr4);
943 
944 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
945 {
946         bool skip_tlb_flush = false;
947 #ifdef CONFIG_X86_64
948         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
949 
950         if (pcid_enabled) {
951                 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
952                 cr3 &= ~X86_CR3_PCID_NOFLUSH;
953         }
954 #endif
955 
956         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
957                 if (!skip_tlb_flush) {
958                         kvm_mmu_sync_roots(vcpu);
959                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
960                 }
961                 return 0;
962         }
963 
964         if (is_long_mode(vcpu) &&
965             (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
966                 return 1;
967         else if (is_pae(vcpu) && is_paging(vcpu) &&
968                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
969                 return 1;
970 
971         kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
972         vcpu->arch.cr3 = cr3;
973         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
974 
975         return 0;
976 }
977 EXPORT_SYMBOL_GPL(kvm_set_cr3);
978 
979 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
980 {
981         if (cr8 & CR8_RESERVED_BITS)
982                 return 1;
983         if (lapic_in_kernel(vcpu))
984                 kvm_lapic_set_tpr(vcpu, cr8);
985         else
986                 vcpu->arch.cr8 = cr8;
987         return 0;
988 }
989 EXPORT_SYMBOL_GPL(kvm_set_cr8);
990 
991 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
992 {
993         if (lapic_in_kernel(vcpu))
994                 return kvm_lapic_get_cr8(vcpu);
995         else
996                 return vcpu->arch.cr8;
997 }
998 EXPORT_SYMBOL_GPL(kvm_get_cr8);
999 
1000 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1001 {
1002         int i;
1003 
1004         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1005                 for (i = 0; i < KVM_NR_DB_REGS; i++)
1006                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1007                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1008         }
1009 }
1010 
1011 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
1012 {
1013         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1014                 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
1015 }
1016 
1017 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
1018 {
1019         unsigned long dr7;
1020 
1021         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1022                 dr7 = vcpu->arch.guest_debug_dr7;
1023         else
1024                 dr7 = vcpu->arch.dr7;
1025         kvm_x86_ops->set_dr7(vcpu, dr7);
1026         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1027         if (dr7 & DR7_BP_EN_MASK)
1028                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1029 }
1030 
1031 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1032 {
1033         u64 fixed = DR6_FIXED_1;
1034 
1035         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1036                 fixed |= DR6_RTM;
1037         return fixed;
1038 }
1039 
1040 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1041 {
1042         switch (dr) {
1043         case 0 ... 3:
1044                 vcpu->arch.db[dr] = val;
1045                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1046                         vcpu->arch.eff_db[dr] = val;
1047                 break;
1048         case 4:
1049                 /* fall through */
1050         case 6:
1051                 if (val & 0xffffffff00000000ULL)
1052                         return -1; /* #GP */
1053                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1054                 kvm_update_dr6(vcpu);
1055                 break;
1056         case 5:
1057                 /* fall through */
1058         default: /* 7 */
1059                 if (val & 0xffffffff00000000ULL)
1060                         return -1; /* #GP */
1061                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1062                 kvm_update_dr7(vcpu);
1063                 break;
1064         }
1065 
1066         return 0;
1067 }
1068 
1069 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1070 {
1071         if (__kvm_set_dr(vcpu, dr, val)) {
1072                 kvm_inject_gp(vcpu, 0);
1073                 return 1;
1074         }
1075         return 0;
1076 }
1077 EXPORT_SYMBOL_GPL(kvm_set_dr);
1078 
1079 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1080 {
1081         switch (dr) {
1082         case 0 ... 3:
1083                 *val = vcpu->arch.db[dr];
1084                 break;
1085         case 4:
1086                 /* fall through */
1087         case 6:
1088                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1089                         *val = vcpu->arch.dr6;
1090                 else
1091                         *val = kvm_x86_ops->get_dr6(vcpu);
1092                 break;
1093         case 5:
1094                 /* fall through */
1095         default: /* 7 */
1096                 *val = vcpu->arch.dr7;
1097                 break;
1098         }
1099         return 0;
1100 }
1101 EXPORT_SYMBOL_GPL(kvm_get_dr);
1102 
1103 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1104 {
1105         u32 ecx = kvm_rcx_read(vcpu);
1106         u64 data;
1107         int err;
1108 
1109         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1110         if (err)
1111                 return err;
1112         kvm_rax_write(vcpu, (u32)data);
1113         kvm_rdx_write(vcpu, data >> 32);
1114         return err;
1115 }
1116 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1117 
1118 /*
1119  * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
1120  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1121  *
1122  * This list is modified at module load time to reflect the
1123  * capabilities of the host CPU. This capability test skips MSRs that are
1124  * KVM-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1125  * may depend on host virtualization features rather than host CPU features.
1126  */
1127 
1128 static u32 msrs_to_save[] = {
1129         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1130         MSR_STAR,
1131 #ifdef CONFIG_X86_64
1132         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1133 #endif
1134         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1135         MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1136         MSR_IA32_SPEC_CTRL,
1137         MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1138         MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1139         MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1140         MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1141         MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1142         MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1143 };
1144 
1145 static unsigned num_msrs_to_save;
1146 
1147 static u32 emulated_msrs[] = {
1148         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1149         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1150         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1151         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1152         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1153         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1154         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1155         HV_X64_MSR_RESET,
1156         HV_X64_MSR_VP_INDEX,
1157         HV_X64_MSR_VP_RUNTIME,
1158         HV_X64_MSR_SCONTROL,
1159         HV_X64_MSR_STIMER0_CONFIG,
1160         HV_X64_MSR_VP_ASSIST_PAGE,
1161         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1162         HV_X64_MSR_TSC_EMULATION_STATUS,
1163 
1164         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1165         MSR_KVM_PV_EOI_EN,
1166 
1167         MSR_IA32_TSC_ADJUST,
1168         MSR_IA32_TSCDEADLINE,
1169         MSR_IA32_ARCH_CAPABILITIES,
1170         MSR_IA32_MISC_ENABLE,
1171         MSR_IA32_MCG_STATUS,
1172         MSR_IA32_MCG_CTL,
1173         MSR_IA32_MCG_EXT_CTL,
1174         MSR_IA32_SMBASE,
1175         MSR_SMI_COUNT,
1176         MSR_PLATFORM_INFO,
1177         MSR_MISC_FEATURES_ENABLES,
1178         MSR_AMD64_VIRT_SPEC_CTRL,
1179         MSR_IA32_POWER_CTL,
1180 
1181         MSR_K7_HWCR,
1182 };
1183 
1184 static unsigned num_emulated_msrs;
1185 
1186 /*
1187  * List of MSR numbers which are used to expose MSR-based features that
1188  * can be used by a hypervisor to validate requested CPU features.
1189  */
1190 static u32 msr_based_features[] = {
1191         MSR_IA32_VMX_BASIC,
1192         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1193         MSR_IA32_VMX_PINBASED_CTLS,
1194         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1195         MSR_IA32_VMX_PROCBASED_CTLS,
1196         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1197         MSR_IA32_VMX_EXIT_CTLS,
1198         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1199         MSR_IA32_VMX_ENTRY_CTLS,
1200         MSR_IA32_VMX_MISC,
1201         MSR_IA32_VMX_CR0_FIXED0,
1202         MSR_IA32_VMX_CR0_FIXED1,
1203         MSR_IA32_VMX_CR4_FIXED0,
1204         MSR_IA32_VMX_CR4_FIXED1,
1205         MSR_IA32_VMX_VMCS_ENUM,
1206         MSR_IA32_VMX_PROCBASED_CTLS2,
1207         MSR_IA32_VMX_EPT_VPID_CAP,
1208         MSR_IA32_VMX_VMFUNC,
1209 
1210         MSR_F10H_DECFG,
1211         MSR_IA32_UCODE_REV,
1212         MSR_IA32_ARCH_CAPABILITIES,
1213 };
1214 
1215 static unsigned int num_msr_based_features;
1216 
1217 u64 kvm_get_arch_capabilities(void)
1218 {
1219         u64 data;
1220 
1221         rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1222 
1223         /*
1224          * If we're doing cache flushes (either "always" or "cond")
1225          * we will do one whenever the guest does a vmlaunch/vmresume.
1226          * If an outer hypervisor is doing the cache flush for us
1227          * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1228          * capability to the guest too, and if EPT is disabled we're not
1229          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1230          * require a nested hypervisor to do a flush of its own.
1231          */
1232         if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1233                 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1234 
1235         return data;
1236 }
1237 EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
1238 
1239 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1240 {
1241         switch (msr->index) {
1242         case MSR_IA32_ARCH_CAPABILITIES:
1243                 msr->data = kvm_get_arch_capabilities();
1244                 break;
1245         case MSR_IA32_UCODE_REV:
1246                 rdmsrl_safe(msr->index, &msr->data);
1247                 break;
1248         default:
1249                 if (kvm_x86_ops->get_msr_feature(msr))
1250                         return 1;
1251         }
1252         return 0;
1253 }
1254 
1255 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1256 {
1257         struct kvm_msr_entry msr;
1258         int r;
1259 
1260         msr.index = index;
1261         r = kvm_get_msr_feature(&msr);
1262         if (r)
1263                 return r;
1264 
1265         *data = msr.data;
1266 
1267         return 0;
1268 }
1269 
1270 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1271 {
1272         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1273                 return false;
1274 
1275         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1276                 return false;
1277 
1278         if (efer & (EFER_LME | EFER_LMA) &&
1279             !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1280                 return false;
1281 
1282         if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1283                 return false;
1284 
1285         return true;
1286 
1287 }
1288 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1289 {
1290         if (efer & efer_reserved_bits)
1291                 return false;
1292 
1293         return __kvm_valid_efer(vcpu, efer);
1294 }
1295 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1296 
1297 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1298 {
1299         u64 old_efer = vcpu->arch.efer;
1300         u64 efer = msr_info->data;
1301 
1302         if (efer & efer_reserved_bits)
1303                 return 1;
1304 
1305         if (!msr_info->host_initiated) {
1306                 if (!__kvm_valid_efer(vcpu, efer))
1307                         return 1;
1308 
1309                 if (is_paging(vcpu) &&
1310                     (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1311                         return 1;
1312         }
1313 
1314         efer &= ~EFER_LMA;
1315         efer |= vcpu->arch.efer & EFER_LMA;
1316 
1317         kvm_x86_ops->set_efer(vcpu, efer);
1318 
1319         /* Update reserved bits */
1320         if ((efer ^ old_efer) & EFER_NX)
1321                 kvm_mmu_reset_context(vcpu);
1322 
1323         return 0;
1324 }
1325 
1326 void kvm_enable_efer_bits(u64 mask)
1327 {
1328        efer_reserved_bits &= ~mask;
1329 }
1330 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1331 
1332 /*
1333  * Writes msr value into the appropriate "register".
1334  * Returns 0 on success, non-0 otherwise.
1335  * Assumes vcpu_load() was already called.
1336  */
1337 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1338 {
1339         switch (msr->index) {
1340         case MSR_FS_BASE:
1341         case MSR_GS_BASE:
1342         case MSR_KERNEL_GS_BASE:
1343         case MSR_CSTAR:
1344         case MSR_LSTAR:
1345                 if (is_noncanonical_address(msr->data, vcpu))
1346                         return 1;
1347                 break;
1348         case MSR_IA32_SYSENTER_EIP:
1349         case MSR_IA32_SYSENTER_ESP:
1350                 /*
1351                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1352                  * non-canonical address is written on Intel but not on
1353                  * AMD (which ignores the top 32-bits, because it does
1354                  * not implement 64-bit SYSENTER).
1355                  *
1356                  * 64-bit code should hence be able to write a non-canonical
1357                  * value on AMD.  Making the address canonical ensures that
1358                  * vmentry does not fail on Intel after writing a non-canonical
1359                  * value, and that something deterministic happens if the guest
1360                  * invokes 64-bit SYSENTER.
1361                  */
1362                 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1363         }
1364         return kvm_x86_ops->set_msr(vcpu, msr);
1365 }
1366 EXPORT_SYMBOL_GPL(kvm_set_msr);
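
/*
 * For illustration of the canonicalization above (assuming 48 virtual
 * address bits, i.e. bits 63:47 must all be copies of bit 47): a guest
 * write of 0x0000800000000000 to IA32_SYSENTER_EIP is non-canonical, and
 * get_canonical() sign-extends it to 0xffff800000000000 before the value
 * is handed to the vendor code, so a later VM entry does not fail.
 */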
1367 
1368 /*
1369  * Adapt kvm_get_msr() and kvm_set_msr() to msr_io()'s calling convention.
1370  */
1371 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1372 {
1373         struct msr_data msr;
1374         int r;
1375 
1376         msr.index = index;
1377         msr.host_initiated = true;
1378         r = kvm_get_msr(vcpu, &msr);
1379         if (r)
1380                 return r;
1381 
1382         *data = msr.data;
1383         return 0;
1384 }
1385 
1386 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1387 {
1388         struct msr_data msr;
1389 
1390         msr.data = *data;
1391         msr.index = index;
1392         msr.host_initiated = true;
1393         return kvm_set_msr(vcpu, &msr);
1394 }
1395 
1396 #ifdef CONFIG_X86_64
1397 struct pvclock_gtod_data {
1398         seqcount_t      seq;
1399 
1400         struct { /* extract of a clocksource struct */
1401                 int vclock_mode;
1402                 u64     cycle_last;
1403                 u64     mask;
1404                 u32     mult;
1405                 u32     shift;
1406         } clock;
1407 
1408         u64             boot_ns;
1409         u64             nsec_base;
1410         u64             wall_time_sec;
1411 };
1412 
1413 static struct pvclock_gtod_data pvclock_gtod_data;
1414 
1415 static void update_pvclock_gtod(struct timekeeper *tk)
1416 {
1417         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1418         u64 boot_ns;
1419 
1420         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1421 
1422         write_seqcount_begin(&vdata->seq);
1423 
1424         /* copy pvclock gtod data */
1425         vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
1426         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1427         vdata->clock.mask               = tk->tkr_mono.mask;
1428         vdata->clock.mult               = tk->tkr_mono.mult;
1429         vdata->clock.shift              = tk->tkr_mono.shift;
1430 
1431         vdata->boot_ns                  = boot_ns;
1432         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
1433 
1434         vdata->wall_time_sec            = tk->xtime_sec;
1435 
1436         write_seqcount_end(&vdata->seq);
1437 }
1438 #endif
1439 
1440 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1441 {
1442         /*
1443          * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1444          * vcpu_enter_guest.  This function is only called from
1445          * the physical CPU that is running vcpu.
1446          */
1447         kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1448 }
1449 
1450 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1451 {
1452         int version;
1453         int r;
1454         struct pvclock_wall_clock wc;
1455         struct timespec64 boot;
1456 
1457         if (!wall_clock)
1458                 return;
1459 
1460         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1461         if (r)
1462                 return;
1463 
1464         if (version & 1)
1465                 ++version;  /* first time write, random junk */
1466 
1467         ++version;
1468 
1469         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1470                 return;
1471 
1472         /*
1473          * The guest calculates current wall clock time by adding
1474          * system time (updated by kvm_guest_time_update below) to the
1475          * wall clock specified here.  Guest system time equals host
1476          * system time for us, thus we must fill in host boot time here.
1477          */
1478         getboottime64(&boot);
1479 
1480         if (kvm->arch.kvmclock_offset) {
1481                 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1482                 boot = timespec64_sub(boot, ts);
1483         }
1484         wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1485         wc.nsec = boot.tv_nsec;
1486         wc.version = version;
1487 
1488         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1489 
1490         version++;
1491         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1492 }
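
/*
 * The version field above follows the usual pvclock convention: it is
 * odd while the structure is being updated and even once the update is
 * complete.  A guest reading the wall clock is therefore expected to do
 * something along these lines (illustrative sketch, not code from this
 * file):
 *
 *	do {
 *		version = wc->version;
 *		rmb();
 *		sec  = wc->sec;
 *		nsec = wc->nsec;
 *		rmb();
 *	} while ((version & 1) || (version != wc->version));
 */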
1493 
1494 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1495 {
1496         do_shl32_div32(dividend, divisor);
1497         return dividend;
1498 }
1499 
1500 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1501                                s8 *pshift, u32 *pmultiplier)
1502 {
1503         uint64_t scaled64;
1504         int32_t  shift = 0;
1505         uint64_t tps64;
1506         uint32_t tps32;
1507 
1508         tps64 = base_hz;
1509         scaled64 = scaled_hz;
1510         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1511                 tps64 >>= 1;
1512                 shift--;
1513         }
1514 
1515         tps32 = (uint32_t)tps64;
1516         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1517                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1518                         scaled64 >>= 1;
1519                 else
1520                         tps32 <<= 1;
1521                 shift++;
1522         }
1523 
1524         *pshift = shift;
1525         *pmultiplier = div_frac(scaled64, tps32);
1526 
1527         pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1528                  __func__, base_hz, scaled_hz, shift, *pmultiplier);
1529 }
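/*
 * Editor's note -- worked example, not part of the original source, and
 * assuming the pvclock convention result = ((delta << shift) * mult) >> 32
 * (a negative shift meaning a right shift of delta).  For
 * base_hz = 2,800,000,000 (a 2.8 GHz TSC) and scaled_hz = NSEC_PER_SEC:
 * the first loop halves tps64 once (2.8e9 > 2 * 1e9), leaving shift = -1
 * and tps32 = 1,400,000,000; the second loop never runs, so
 * mult = (1e9 << 32) / 1.4e9 = 3067833782 (0xb6db6db6, ~5/7 in 0.32
 * fixed point).  Check: ((delta >> 1) * 0xb6db6db6) >> 32 is about
 * delta / 2.8, i.e. TSC cycles converted to nanoseconds at 2.8 GHz.
 */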
1530 
1531 #ifdef CONFIG_X86_64
1532 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1533 #endif
1534 
1535 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1536 static unsigned long max_tsc_khz;
1537 
1538 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1539 {
1540         u64 v = (u64)khz * (1000000 + ppm);
1541         do_div(v, 1000000);
1542         return v;
1543 }
1544 
1545 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1546 {
1547         u64 ratio;
1548 
1549         /* Guest TSC same frequency as host TSC? */
1550         if (!scale) {
1551                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1552                 return 0;
1553         }
1554 
1555         /* TSC scaling supported? */
1556         if (!kvm_has_tsc_control) {
1557                 if (user_tsc_khz > tsc_khz) {
1558                         vcpu->arch.tsc_catchup = 1;
1559                         vcpu->arch.tsc_always_catchup = 1;
1560                         return 0;
1561                 } else {
1562                         pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
1563                         return -1;
1564                 }
1565         }
1566 
1567         /* TSC scaling required - calculate ratio */
1568         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1569                                 user_tsc_khz, tsc_khz);
1570 
1571         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1572                 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1573                                     user_tsc_khz);
1574                 return -1;
1575         }
1576 
1577         vcpu->arch.tsc_scaling_ratio = ratio;
1578         return 0;
1579 }
1580 
1581 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1582 {
1583         u32 thresh_lo, thresh_hi;
1584         int use_scaling = 0;
1585 
1586         /* tsc_khz can be zero if TSC calibration fails */
1587         if (user_tsc_khz == 0) {
1588                 /* set tsc_scaling_ratio to a safe value */
1589                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1590                 return -1;
1591         }
1592 
1593         /* Compute a scale to convert nanoseconds in TSC cycles */
1594         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1595                            &vcpu->arch.virtual_tsc_shift,
1596                            &vcpu->arch.virtual_tsc_mult);
1597         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1598 
1599         /*
1600          * Compute the variation in TSC rate that is acceptable
1601          * within the tolerance range and decide whether the
1602          * rate being applied is within those bounds of the hardware
1603          * rate.  If so, no scaling or compensation needs to be done.
1604          */
1605         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1606         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1607         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1608                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1609                 use_scaling = 1;
1610         }
1611         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1612 }
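/*
 * Editor's note -- worked example, not part of the original source.
 * Assuming the default tsc_tolerance_ppm of 250 (it is a module
 * parameter) and a host tsc_khz of 2,800,000:
 *   thresh_lo = 2,800,000 * 999,750 / 1,000,000 = 2,799,300
 *   thresh_hi = 2,800,000 * 1,000,250 / 1,000,000 = 2,800,700
 * A guest configured for 2,500,000 kHz falls outside [thresh_lo,
 * thresh_hi], so use_scaling = 1 and set_tsc_khz() either programs
 * hardware TSC scaling or falls back to catch-up mode.
 */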
1613 
1614 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1615 {
1616         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1617                                       vcpu->arch.virtual_tsc_mult,
1618                                       vcpu->arch.virtual_tsc_shift);
1619         tsc += vcpu->arch.this_tsc_write;
1620         return tsc;
1621 }
1622 
1623 static inline int gtod_is_based_on_tsc(int mode)
1624 {
1625         return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1626 }
1627 
1628 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1629 {
1630 #ifdef CONFIG_X86_64
1631         bool vcpus_matched;
1632         struct kvm_arch *ka = &vcpu->kvm->arch;
1633         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1634 
1635         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1636                          atomic_read(&vcpu->kvm->online_vcpus));
1637 
1638         /*
1639          * Once the masterclock is enabled, always perform the request
1640          * in order to update it.
1641          *
1642          * In order to enable the masterclock, the host clocksource must be
1643          * TSC and the vcpus need to have matched TSCs.  When that happens,
1644          * perform the request to enable the masterclock.
1645          */
1646         if (ka->use_master_clock ||
1647             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1648                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1649 
1650         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1651                             atomic_read(&vcpu->kvm->online_vcpus),
1652                             ka->use_master_clock, gtod->clock.vclock_mode);
1653 #endif
1654 }
1655 
1656 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1657 {
1658         u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1659         vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1660 }
1661 
1662 /*
1663  * Multiply tsc by a fixed point number represented by ratio.
1664  *
1665  * The most significant 64-N bits (mult) of ratio represent the
1666  * integral part of the fixed point number; the remaining N bits
1667  * (frac) represent the fractional part, ie. ratio represents a fixed
1668  * point number (mult + frac * 2^(-N)).
1669  *
1670  * N equals to kvm_tsc_scaling_ratio_frac_bits.
1671  */
1672 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1673 {
1674         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1675 }
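/*
 * Editor's note -- worked example, not part of the original source.
 * Assuming N = kvm_tsc_scaling_ratio_frac_bits = 48 (the VMX format;
 * SVM uses 32), a 2.5 GHz guest on a 3.0 GHz host gets, via
 * set_tsc_khz() above, ratio = (2,500,000 << 48) / 3,000,000, i.e.
 * roughly 0.8333 * 2^48.  Then __scale_tsc(ratio, 3,000,000,000) =
 * mul_u64_u64_shr(3e9, ratio, 48) is about 2,500,000,000: one host
 * second's worth of cycles reads back as one guest second at the
 * slower advertised rate.
 */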
1676 
1677 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1678 {
1679         u64 _tsc = tsc;
1680         u64 ratio = vcpu->arch.tsc_scaling_ratio;
1681 
1682         if (ratio != kvm_default_tsc_scaling_ratio)
1683                 _tsc = __scale_tsc(ratio, tsc);
1684 
1685         return _tsc;
1686 }
1687 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1688 
1689 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1690 {
1691         u64 tsc;
1692 
1693         tsc = kvm_scale_tsc(vcpu, rdtsc());
1694 
1695         return target_tsc - tsc;
1696 }
1697 
1698 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1699 {
1700         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1701 
1702         return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1703 }
1704 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1705 
1706 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1707 {
1708         vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
1709 }
1710 
1711 static inline bool kvm_check_tsc_unstable(void)
1712 {
1713 #ifdef CONFIG_X86_64
1714         /*
1715          * TSC is marked unstable when we're running on Hyper-V, but
1716          * the 'TSC page' clocksource is still good in that case.
1717          */
1718         if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1719                 return false;
1720 #endif
1721         return check_tsc_unstable();
1722 }
1723 
1724 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1725 {
1726         struct kvm *kvm = vcpu->kvm;
1727         u64 offset, ns, elapsed;
1728         unsigned long flags;
1729         bool matched;
1730         bool already_matched;
1731         u64 data = msr->data;
1732         bool synchronizing = false;
1733 
1734         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1735         offset = kvm_compute_tsc_offset(vcpu, data);
1736         ns = ktime_get_boot_ns();
1737         elapsed = ns - kvm->arch.last_tsc_nsec;
1738 
1739         if (vcpu->arch.virtual_tsc_khz) {
1740                 if (data == 0 && msr->host_initiated) {
1741                         /*
1742                          * detection of vcpu initialization -- need to sync
1743                          * with other vCPUs. This particularly helps to keep
1744                          * kvm_clock stable after CPU hotplug
1745                          */
1746                         synchronizing = true;
1747                 } else {
1748                         u64 tsc_exp = kvm->arch.last_tsc_write +
1749                                                 nsec_to_cycles(vcpu, elapsed);
1750                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1751                         /*
1752                          * Special case: a TSC write within a small delta (1 second)
1753                          * of virtual cycle time against elapsed real time is
1754                          * interpreted as an attempt to synchronize the CPU.
1755                          */
1756                         synchronizing = data < tsc_exp + tsc_hz &&
1757                                         data + tsc_hz > tsc_exp;
1758                 }
1759         }
1760 
1761         /*
1762          * For a reliable TSC, we can match TSC offsets, and for an unstable
1763          * TSC, we add elapsed time in this computation.  We could let the
1764          * compensation code attempt to catch up if we fall behind, but
1765          * it's better to try to match offsets from the beginning.
1766          */
1767         if (synchronizing &&
1768             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1769                 if (!kvm_check_tsc_unstable()) {
1770                         offset = kvm->arch.cur_tsc_offset;
1771                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1772                 } else {
1773                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1774                         data += delta;
1775                         offset = kvm_compute_tsc_offset(vcpu, data);
1776                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1777                 }
1778                 matched = true;
1779                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1780         } else {
1781                 /*
1782                  * We split periods of matched TSC writes into generations.
1783                  * For each generation, we track the original measured
1784                  * nanosecond time, offset, and write, so if TSCs are in
1785                  * sync, we can match exact offset, and if not, we can match
1786                  * exact software computation in compute_guest_tsc()
1787                  *
1788                  * These values are tracked in kvm->arch.cur_xxx variables.
1789                  */
1790                 kvm->arch.cur_tsc_generation++;
1791                 kvm->arch.cur_tsc_nsec = ns;
1792                 kvm->arch.cur_tsc_write = data;
1793                 kvm->arch.cur_tsc_offset = offset;
1794                 matched = false;
1795                 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1796                          kvm->arch.cur_tsc_generation, data);
1797         }
1798 
1799         /*
1800          * We also track the most recent recorded KHz, write, and time to
1801          * allow the matching interval to be extended at each write.
1802          */
1803         kvm->arch.last_tsc_nsec = ns;
1804         kvm->arch.last_tsc_write = data;
1805         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1806 
1807         vcpu->arch.last_guest_tsc = data;
1808 
1809         /* Keep track of which generation this VCPU has synchronized to */
1810         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1811         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1812         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1813 
1814         if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1815                 update_ia32_tsc_adjust_msr(vcpu, offset);
1816 
1817         kvm_vcpu_write_tsc_offset(vcpu, offset);
1818         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1819 
1820         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1821         if (!matched) {
1822                 kvm->arch.nr_vcpus_matched_tsc = 0;
1823         } else if (!already_matched) {
1824                 kvm->arch.nr_vcpus_matched_tsc++;
1825         }
1826 
1827         kvm_track_tsc_matching(vcpu);
1828         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1829 }
1830 
1831 EXPORT_SYMBOL_GPL(kvm_write_tsc);
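/*
 * Editor's note -- worked example for the "synchronizing" heuristic in
 * kvm_write_tsc() above, not part of the original source.  Suppose
 * virtual_tsc_khz = 1,000,000 (a 1 GHz guest TSC), the previous write
 * stored last_tsc_write = 1e9, and 2 seconds of host time have elapsed.
 * Then tsc_exp = 1e9 + nsec_to_cycles(2s) = 3e9 and tsc_hz = 1e9, so any
 * new write in the open window (2e9, 4e9) is treated as an attempt to
 * synchronize with the other vCPUs rather than as a deliberate jump:
 * KVM reuses the matched offset on a stable TSC, or compensates by the
 * elapsed time on an unstable one.
 */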
1832 
1833 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1834                                            s64 adjustment)
1835 {
1836         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1837         kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
1838 }
1839 
1840 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1841 {
1842         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1843                 WARN_ON(adjustment < 0);
1844         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1845         adjust_tsc_offset_guest(vcpu, adjustment);
1846 }
1847 
1848 #ifdef CONFIG_X86_64
1849 
1850 static u64 read_tsc(void)
1851 {
1852         u64 ret = (u64)rdtsc_ordered();
1853         u64 last = pvclock_gtod_data.clock.cycle_last;
1854 
1855         if (likely(ret >= last))
1856                 return ret;
1857 
1858         /*
1859          * GCC likes to generate cmov here, but this branch is extremely
1860          * predictable (it's just a function of time and the likely is
1861          * very likely) and there's a data dependence, so force GCC
1862          * to generate a branch instead.  I don't barrier() because
1863          * we don't actually need a barrier, and if this function
1864          * ever gets inlined it will generate worse code.
1865          */
1866         asm volatile ("");
1867         return last;
1868 }
1869 
1870 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1871 {
1872         long v;
1873         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1874         u64 tsc_pg_val;
1875 
1876         switch (gtod->clock.vclock_mode) {
1877         case VCLOCK_HVCLOCK:
1878                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1879                                                   tsc_timestamp);
1880                 if (tsc_pg_val != U64_MAX) {
1881                         /* TSC page valid */
1882                         *mode = VCLOCK_HVCLOCK;
1883                         v = (tsc_pg_val - gtod->clock.cycle_last) &
1884                                 gtod->clock.mask;
1885                 } else {
1886                         /* TSC page invalid */
1887                         *mode = VCLOCK_NONE;
1888                 }
1889                 break;
1890         case VCLOCK_TSC:
1891                 *mode = VCLOCK_TSC;
1892                 *tsc_timestamp = read_tsc();
1893                 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1894                         gtod->clock.mask;
1895                 break;
1896         default:
1897                 *mode = VCLOCK_NONE;
1898         }
1899 
1900         if (*mode == VCLOCK_NONE)
1901                 *tsc_timestamp = v = 0;
1902 
1903         return v * gtod->clock.mult;
1904 }
1905 
1906 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1907 {
1908         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1909         unsigned long seq;
1910         int mode;
1911         u64 ns;
1912 
1913         do {
1914                 seq = read_seqcount_begin(&gtod->seq);
1915                 ns = gtod->nsec_base;
1916                 ns += vgettsc(tsc_timestamp, &mode);
1917                 ns >>= gtod->clock.shift;
1918                 ns += gtod->boot_ns;
1919         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1920         *t = ns;
1921 
1922         return mode;
1923 }
1924 
1925 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
1926 {
1927         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1928         unsigned long seq;
1929         int mode;
1930         u64 ns;
1931 
1932         do {
1933                 seq = read_seqcount_begin(&gtod->seq);
1934                 ts->tv_sec = gtod->wall_time_sec;
1935                 ns = gtod->nsec_base;
1936                 ns += vgettsc(tsc_timestamp, &mode);
1937                 ns >>= gtod->clock.shift;
1938         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1939 
1940         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1941         ts->tv_nsec = ns;
1942 
1943         return mode;
1944 }
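/*
 * Editor's note -- not part of the original source; a summary of the
 * arithmetic in do_monotonic_boot()/do_realtime() above, using the
 * tkr_mono values captured by update_pvclock_gtod():
 *
 *   ns = (nsec_base + ((tsc - cycle_last) & mask) * mult) >> shift
 *        (+ boot_ns for the boot-based variant)
 *
 * all read under gtod->seq, so a concurrent update_pvclock_gtod()
 * forces the loop to retry with consistent values.
 */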
1945 
1946 /* returns true if host is using TSC based clocksource */
1947 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1948 {
1949         /* checked again under seqlock below */
1950         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1951                 return false;
1952 
1953         return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1954                                                       tsc_timestamp));
1955 }
1956 
1957 /* returns true if host is using TSC based clocksource */
1958 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
1959                                            u64 *tsc_timestamp)
1960 {
1961         /* checked again under seqlock below */
1962         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1963                 return false;
1964 
1965         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1966 }
1967 #endif
1968 
1969 /*
1970  *
1971  * Assuming a stable TSC across physical CPUs, and a stable TSC
1972  * across virtual CPUs, the following condition is possible.
1973  * Each numbered line represents an event visible to both
1974  * CPUs at the next numbered event.
1975  *
1976  * "timespecX" represents host monotonic time. "tscX" represents
1977  * RDTSC value.
1978  *
1979  *              VCPU0 on CPU0           |       VCPU1 on CPU1
1980  *
1981  * 1.  read timespec0,tsc0
1982  * 2.                                   | timespec1 = timespec0 + N
1983  *                                      | tsc1 = tsc0 + M
1984  * 3. transition to guest               | transition to guest
1985  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1986  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1987  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1988  *
1989  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1990  *
1991  *      - ret0 < ret1
1992  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1993  *              ...
1994  *      - 0 < N - M => M < N
1995  *
1996  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1997  * always the case (the difference between two distinct xtime instances
1998  * might be smaller than the difference between corresponding TSC reads,
1999  * when updating guest vcpus' pvclock areas).
2000  *
2001  * To avoid that problem, do not allow visibility of distinct
2002  * system_timestamp/tsc_timestamp values simultaneously: use a master
2003  * copy of host monotonic time values. Update that master copy
2004  * in lockstep.
2005  *
2006  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2007  *
2008  */
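/*
 * Editor's note -- concrete numbers for the scenario above, not part of
 * the original source.  Let timespec0 = 100 ns at tsc0 = 1000 and
 * suppose the host publishes VCPU1's copy with N = 10 ns of wall time
 * while the TSC advanced by M = 20 cycles (M > N, e.g. because NTP
 * slowed the clock).  At rdtsc = 1100, VCPU0 computes
 * ret0 = 100 + (1100 - 1000) = 200 while VCPU1 computes
 * ret1 = 110 + (1100 - 1020) = 190 < ret0: time appears to run
 * backwards across vCPUs, which is exactly what the single master copy
 * maintained below avoids.
 */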
2009 
2010 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2011 {
2012 #ifdef CONFIG_X86_64
2013         struct kvm_arch *ka = &kvm->arch;
2014         int vclock_mode;
2015         bool host_tsc_clocksource, vcpus_matched;
2016 
2017         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2018                         atomic_read(&kvm->online_vcpus));
2019 
2020         /*
2021          * If the host uses TSC clock, then passthrough TSC as stable
2022          * to the guest.
2023          */
2024         host_tsc_clocksource = kvm_get_time_and_clockread(
2025                                         &ka->master_kernel_ns,
2026                                         &ka->master_cycle_now);
2027 
2028         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2029                                 && !ka->backwards_tsc_observed
2030                                 && !ka->boot_vcpu_runs_old_kvmclock;
2031 
2032         if (ka->use_master_clock)
2033                 atomic_set(&kvm_guest_has_master_clock, 1);
2034 
2035         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2036         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2037                                         vcpus_matched);
2038 #endif
2039 }
2040 
2041 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2042 {
2043         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2044 }
2045 
2046 static void kvm_gen_update_masterclock(struct kvm *kvm)
2047 {
2048 #ifdef CONFIG_X86_64
2049         int i;
2050         struct kvm_vcpu *vcpu;
2051         struct kvm_arch *ka = &kvm->arch;
2052 
2053         spin_lock(&ka->pvclock_gtod_sync_lock);
2054         kvm_make_mclock_inprogress_request(kvm);
2055         /* no guest entries from this point */
2056         pvclock_update_vm_gtod_copy(kvm);
2057 
2058         kvm_for_each_vcpu(i, vcpu, kvm)
2059                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2060 
2061         /* guest entries allowed */
2062         kvm_for_each_vcpu(i, vcpu, kvm)
2063                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2064 
2065         spin_unlock(&ka->pvclock_gtod_sync_lock);
2066 #endif
2067 }
2068 
2069 u64 get_kvmclock_ns(struct kvm *kvm)
2070 {
2071         struct kvm_arch *ka = &kvm->arch;
2072         struct pvclock_vcpu_time_info hv_clock;
2073         u64 ret;
2074 
2075         spin_lock(&ka->pvclock_gtod_sync_lock);
2076         if (!ka->use_master_clock) {
2077                 spin_unlock(&ka->pvclock_gtod_sync_lock);
2078                 return ktime_get_boot_ns() + ka->kvmclock_offset;
2079         }
2080 
2081         hv_clock.tsc_timestamp = ka->master_cycle_now;
2082         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2083         spin_unlock(&ka->pvclock_gtod_sync_lock);
2084 
2085         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2086         get_cpu();
2087 
2088         if (__this_cpu_read(cpu_tsc_khz)) {
2089                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2090                                    &hv_clock.tsc_shift,
2091                                    &hv_clock.tsc_to_system_mul);
2092                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2093         } else
2094                 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
2095 
2096         put_cpu();
2097 
2098         return ret;
2099 }
2100 
2101 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
2102 {
2103         struct kvm_vcpu_arch *vcpu = &v->arch;
2104         struct pvclock_vcpu_time_info guest_hv_clock;
2105 
2106         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
2107                 &guest_hv_clock, sizeof(guest_hv_clock))))
2108                 return;
2109 
2110         /* This VCPU is paused, but it's legal for a guest to read another
2111          * VCPU's kvmclock, so we really have to follow the specification where
2112          * it says that version is odd if data is being modified, and even after
2113          * it is consistent.
2114          *
2115          * Version field updates must be kept separate.  This is because
2116          * kvm_write_guest_cached might use a "rep movs" instruction, and
2117          * writes within a string instruction are weakly ordered.  So there
2118          * are three writes overall.
2119          *
2120          * As a small optimization, only write the version field in the first
2121          * and third write.  The vcpu->pv_time cache is still valid, because the
2122          * version field is the first in the struct.
2123          */
2124         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2125 
2126         if (guest_hv_clock.version & 1)
2127                 ++guest_hv_clock.version;  /* first time write, random junk */
2128 
2129         vcpu->hv_clock.version = guest_hv_clock.version + 1;
2130         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2131                                 &vcpu->hv_clock,
2132                                 sizeof(vcpu->hv_clock.version));
2133 
2134         smp_wmb();
2135 
2136         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2137         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2138 
2139         if (vcpu->pvclock_set_guest_stopped_request) {
2140                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2141                 vcpu->pvclock_set_guest_stopped_request = false;
2142         }
2143 
2144         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2145 
2146         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2147                                 &vcpu->hv_clock,
2148                                 sizeof(vcpu->hv_clock));
2149 
2150         smp_wmb();
2151 
2152         vcpu->hv_clock.version++;
2153         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2154                                 &vcpu->hv_clock,
2155                                 sizeof(vcpu->hv_clock.version));
2156 }
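/*
 * Editor's note -- illustrative guest-side sketch, not part of the
 * original source.  With the page published above, the guest turns a
 * TSC read into nanoseconds using the snapshot's mul/shift pair; a real
 * reader also wraps this in the odd/even version loop that the three
 * writes above are defending against.  The function name is
 * hypothetical.
 */
#if 0	/* guest-side sketch, not compiled here */
static u64 example_pvclock_to_ns(const struct pvclock_vcpu_time_info *src,
				 u64 tsc)
{
	u64 delta = tsc - src->tsc_timestamp;

	/* Same fixed-point step as pvclock_scale_delta(). */
	if (src->tsc_shift < 0)
		delta >>= -src->tsc_shift;
	else
		delta <<= src->tsc_shift;

	return src->system_time +
	       mul_u64_u32_shr(delta, src->tsc_to_system_mul, 32);
}
#endif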
2157 
2158 static int kvm_guest_time_update(struct kvm_vcpu *v)
2159 {
2160         unsigned long flags, tgt_tsc_khz;
2161         struct kvm_vcpu_arch *vcpu = &v->arch;
2162         struct kvm_arch *ka = &v->kvm->arch;
2163         s64 kernel_ns;
2164         u64 tsc_timestamp, host_tsc;
2165         u8 pvclock_flags;
2166         bool use_master_clock;
2167 
2168         kernel_ns = 0;
2169         host_tsc = 0;
2170 
2171         /*
2172          * If the host uses TSC clock, then passthrough TSC as stable
2173          * to the guest.
2174          */
2175         spin_lock(&ka->pvclock_gtod_sync_lock);
2176         use_master_clock = ka->use_master_clock;
2177         if (use_master_clock) {
2178                 host_tsc = ka->master_cycle_now;
2179                 kernel_ns = ka->master_kernel_ns;
2180         }
2181         spin_unlock(&ka->pvclock_gtod_sync_lock);
2182 
2183         /* Keep irq disabled to prevent changes to the clock */
2184         local_irq_save(flags);
2185         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2186         if (unlikely(tgt_tsc_khz == 0)) {
2187                 local_irq_restore(flags);
2188                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2189                 return 1;
2190         }
2191         if (!use_master_clock) {
2192                 host_tsc = rdtsc();
2193                 kernel_ns = ktime_get_boot_ns();
2194         }
2195 
2196         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2197 
2198         /*
2199          * We may have to catch up the TSC to match elapsed wall clock
2200          * time for two reasons, even if kvmclock is used.
2201          *   1) CPU could have been running below the maximum TSC rate
2202          *   2) Broken TSC compensation resets the base at each VCPU
2203          *      entry to avoid unknown leaps of TSC even when running
2204          *      again on the same CPU.  This may cause apparent elapsed
2205          *      time to disappear, and the guest to stand still or run
2206          *      very slowly.
2207          */
2208         if (vcpu->tsc_catchup) {
2209                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2210                 if (tsc > tsc_timestamp) {
2211                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2212                         tsc_timestamp = tsc;
2213                 }
2214         }
2215 
2216         local_irq_restore(flags);
2217 
2218         /* With all the info we got, fill in the values */
2219 
2220         if (kvm_has_tsc_control)
2221                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2222 
2223         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2224                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2225                                    &vcpu->hv_clock.tsc_shift,
2226                                    &vcpu->hv_clock.tsc_to_system_mul);
2227                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2228         }
2229 
2230         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2231         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2232         vcpu->last_guest_tsc = tsc_timestamp;
2233 
2234         /* If the host uses TSC clocksource, then it is stable */
2235         pvclock_flags = 0;
2236         if (use_master_clock)
2237                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2238 
2239         vcpu->hv_clock.flags = pvclock_flags;
2240 
2241         if (vcpu->pv_time_enabled)
2242                 kvm_setup_pvclock_page(v);
2243         if (v == kvm_get_vcpu(v->kvm, 0))
2244                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2245         return 0;
2246 }
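/*
 * Editor's note -- rough illustration of the catch-up path above, not
 * part of the original source.  Suppose a guest advertised at 2 GHz
 * runs on a 1.5 GHz host without hardware TSC scaling, so
 * tsc_catchup = 1.  After ~1 ms of elapsed time, compute_guest_tsc()
 * expects roughly 2,000,000 more guest cycles, but the hardware-derived
 * tsc_timestamp advanced only ~1,500,000, so the offset is bumped by
 * the ~500,000-cycle difference and the guest never sees its TSC lag
 * behind its own kvmclock.
 */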
2247 
2248 /*
2249  * kvmclock updates which are isolated to a given vcpu, such as
2250  * vcpu->cpu migration, should not allow system_timestamp from
2251  * the rest of the vcpus to remain static. Otherwise ntp frequency
2252  * correction applies to one vcpu's system_timestamp but not
2253  * the others.
2254  *
2255  * So in those cases, request a kvmclock update for all vcpus.
2256  * We need to rate-limit these requests though, as they can
2257  * considerably slow guests that have a large number of vcpus.
2258  * The time for a remote vcpu to update its kvmclock is bound
2259  * by the delay we use to rate-limit the updates.
2260  */
2261 
2262 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2263 
2264 static void kvmclock_update_fn(struct work_struct *work)
2265 {
2266         int i;
2267         struct delayed_work *dwork = to_delayed_work(work);
2268         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2269                                            kvmclock_update_work);
2270         struct kvm *kvm = container_of(ka, struct kvm, arch);
2271         struct kvm_vcpu *vcpu;
2272 
2273         kvm_for_each_vcpu(i, vcpu, kvm) {
2274                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2275                 kvm_vcpu_kick(vcpu);
2276         }
2277 }
2278 
2279 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2280 {
2281         struct kvm *kvm = v->kvm;
2282 
2283         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2284         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2285                                         KVMCLOCK_UPDATE_DELAY);
2286 }
2287 
2288 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2289 
2290 static void kvmclock_sync_fn(struct work_struct *work)
2291 {
2292         struct delayed_work *dwork = to_delayed_work(work);
2293         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2294                                            kvmclock_sync_work);
2295         struct kvm *kvm = container_of(ka, struct kvm, arch);
2296 
2297         if (!kvmclock_periodic_sync)
2298                 return;
2299 
2300         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2301         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2302                                         KVMCLOCK_SYNC_PERIOD);
2303 }
2304 
2305 /*
2306  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2307  */
2308 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2309 {
2310         /* McStatusWrEn enabled? */
2311         if (guest_cpuid_is_amd(vcpu))
2312                 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2313 
2314         return false;
2315 }
2316 
2317 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2318 {
2319         u64 mcg_cap = vcpu->arch.mcg_cap;
2320         unsigned bank_num = mcg_cap & 0xff;
2321         u32 msr = msr_info->index;
2322         u64 data = msr_info->data;
2323 
2324         switch (msr) {
2325         case MSR_IA32_MCG_STATUS:
2326                 vcpu->arch.mcg_status = data;
2327                 break;
2328         case MSR_IA32_MCG_CTL:
2329                 if (!(mcg_cap & MCG_CTL_P) &&
2330                     (data || !msr_info->host_initiated))
2331                         return 1;
2332                 if (data != 0 && data != ~(u64)0)
2333                         return 1;
2334                 vcpu->arch.mcg_ctl = data;
2335                 break;
2336         default:
2337                 if (msr >= MSR_IA32_MC0_CTL &&
2338                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2339                         u32 offset = msr - MSR_IA32_MC0_CTL;
2340                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2341                          * Some Linux kernels, though, clear bit 10 in bank 4 to
2342                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2343                          * this to avoid an uncaught #GP in the guest.
2344                          */
2345                         if ((offset & 0x3) == 0 &&
2346                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2347                                 return -1;
2348 
2349                         /* MCi_STATUS */
2350                         if (!msr_info->host_initiated &&
2351                             (offset & 0x3) == 1 && data != 0) {
2352                                 if (!can_set_mci_status(vcpu))
2353                                         return -1;
2354                         }
2355 
2356                         vcpu->arch.mce_banks[offset] = data;
2357                         break;
2358                 }
2359                 return 1;
2360         }
2361         return 0;
2362 }
2363 
2364 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2365 {
2366         struct kvm *kvm = vcpu->kvm;
2367         int lm = is_long_mode(vcpu);
2368         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2369                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2370         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2371                 : kvm->arch.xen_hvm_config.blob_size_32;
2372         u32 page_num = data & ~PAGE_MASK;
2373         u64 page_addr = data & PAGE_MASK;
2374         u8 *page;
2375         int r;
2376 
2377         r = -E2BIG;
2378         if (page_num >= blob_size)
2379                 goto out;
2380         r = -ENOMEM;
2381         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2382         if (IS_ERR(page)) {
2383                 r = PTR_ERR(page);
2384                 goto out;
2385         }
2386         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2387                 goto out_free;
2388         r = 0;
2389 out_free:
2390         kfree(page);
2391 out:
2392         return r;
2393 }
2394 
2395 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2396 {
2397         gpa_t gpa = data & ~0x3f;
2398 
2399         /* Bits 3:5 are reserved; should be zero */
2400         if (data & 0x38)
2401                 return 1;
2402 
2403         vcpu->arch.apf.msr_val = data;
2404 
2405         if (!(data & KVM_ASYNC_PF_ENABLED)) {
2406                 kvm_clear_async_pf_completion_queue(vcpu);
2407                 kvm_async_pf_hash_reset(vcpu);
2408                 return 0;
2409         }
2410 
2411         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2412                                         sizeof(u32)))
2413                 return 1;
2414 
2415         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2416         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2417         kvm_async_pf_wakeup_all(vcpu);
2418         return 0;
2419 }
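/*
 * Editor's note -- not part of the original source; a summary of the
 * MSR_KVM_ASYNC_PF_EN layout as decoded above: bit 0 enables async page
 * faults (KVM_ASYNC_PF_ENABLED), bit 1 asks for faults even in kernel
 * mode (KVM_ASYNC_PF_SEND_ALWAYS, inverted into send_user_only), bit 2
 * selects delivery as a #PF VM exit (KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT),
 * bits 3:5 must be zero, and bits 63:6 give the 64-byte-aligned guest
 * address of the u32 word used for the async-PF handshake.
 */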
2420 
2421 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2422 {
2423         vcpu->arch.pv_time_enabled = false;
2424 }
2425 
2426 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2427 {
2428         ++vcpu->stat.tlb_flush;
2429         kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2430 }
2431 
2432 static void record_steal_time(struct kvm_vcpu *vcpu)
2433 {
2434         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2435                 return;
2436 
2437         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2438                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2439                 return;
2440 
2441         /*
2442          * Doing a TLB flush here, on the guest's behalf, can avoid
2443          * expensive IPIs.
2444          */
2445         if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2446                 kvm_vcpu_flush_tlb(vcpu, false);
2447 
2448         if (vcpu->arch.st.steal.version & 1)
2449                 vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
2450 
2451         vcpu->arch.st.steal.version += 1;
2452 
2453         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2454                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2455 
2456         smp_wmb();
2457 
2458         vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2459                 vcpu->arch.st.last_steal;
2460         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2461 
2462         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2463                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2464 
2465         smp_wmb();
2466 
2467         vcpu->arch.st.steal.version += 1;
2468 
2469         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2470                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2471 }
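/*
 * Editor's note -- not part of the original source.  run_delay is the
 * cumulative time the vCPU task spent waiting on a runqueue; the delta
 * accumulated since the last update is credited to the guest as steal
 * time using the same odd/even version protocol as kvmclock, so the
 * guest can report CPU time stolen by the host in its own accounting.
 */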
2472 
2473 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2474 {
2475         bool pr = false;
2476         u32 msr = msr_info->index;
2477         u64 data = msr_info->data;
2478 
2479         switch (msr) {
2480         case MSR_AMD64_NB_CFG:
2481         case MSR_IA32_UCODE_WRITE:
2482         case MSR_VM_HSAVE_PA:
2483         case MSR_AMD64_PATCH_LOADER:
2484         case MSR_AMD64_BU_CFG2:
2485         case MSR_AMD64_DC_CFG:
2486         case MSR_F15H_EX_CFG:
2487                 break;
2488 
2489         case MSR_IA32_UCODE_REV:
2490                 if (msr_info->host_initiated)
2491                         vcpu->arch.microcode_version = data;
2492                 break;
2493         case MSR_IA32_ARCH_CAPABILITIES:
2494                 if (!msr_info->host_initiated)
2495                         return 1;
2496                 vcpu->arch.arch_capabilities = data;
2497                 break;
2498         case MSR_EFER:
2499                 return set_efer(vcpu, msr_info);
2500         case MSR_K7_HWCR:
2501                 data &= ~(u64)0x40;     /* ignore flush filter disable */
2502                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2503                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
2504 
2505                 /* Handle McStatusWrEn */
2506                 if (data == BIT_ULL(18)) {
2507                         vcpu->arch.msr_hwcr = data;
2508                 } else if (data != 0) {
2509                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2510                                     data);
2511                         return 1;
2512                 }
2513                 break;
2514         case MSR_FAM10H_MMIO_CONF_BASE:
2515                 if (data != 0) {
2516                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2517                                     "0x%llx\n", data);
2518                         return 1;
2519                 }
2520                 break;
2521         case MSR_IA32_DEBUGCTLMSR:
2522                 if (!data) {
2523                         /* We support the non-activated case already */
2524                         break;
2525                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2526                         /* Values other than LBR and BTF are vendor-specific,
2527                            thus reserved and should throw a #GP */
2528                         return 1;
2529                 }
2530                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2531                             __func__, data);
2532                 break;
2533         case 0x200 ... 0x2ff:
2534                 return kvm_mtrr_set_msr(vcpu, msr, data);
2535         case MSR_IA32_APICBASE:
2536                 return kvm_set_apic_base(vcpu, msr_info);
2537         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2538                 return kvm_x2apic_msr_write(vcpu, msr, data);
2539         case MSR_IA32_TSCDEADLINE:
2540                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2541                 break;
2542         case MSR_IA32_TSC_ADJUST:
2543                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2544                         if (!msr_info->host_initiated) {
2545                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2546                                 adjust_tsc_offset_guest(vcpu, adj);
2547                         }
2548                         vcpu->arch.ia32_tsc_adjust_msr = data;
2549                 }
2550                 break;
2551         case MSR_IA32_MISC_ENABLE:
2552                 vcpu->arch.ia32_misc_enable_msr = data;
2553                 break;
2554         case MSR_IA32_SMBASE:
2555                 if (!msr_info->host_initiated)
2556                         return 1;
2557                 vcpu->arch.smbase = data;
2558                 break;
2559         case MSR_IA32_TSC:
2560                 kvm_write_tsc(vcpu, msr_info);
2561                 break;
2562         case MSR_SMI_COUNT:
2563                 if (!msr_info->host_initiated)
2564                         return 1;
2565                 vcpu->arch.smi_count = data;
2566                 break;
2567         case MSR_KVM_WALL_CLOCK_NEW:
2568         case MSR_KVM_WALL_CLOCK:
2569                 vcpu->kvm->arch.wall_clock = data;
2570                 kvm_write_wall_clock(vcpu->kvm, data);
2571                 break;
2572         case MSR_KVM_SYSTEM_TIME_NEW:
2573         case MSR_KVM_SYSTEM_TIME: {
2574                 struct kvm_arch *ka = &vcpu->kvm->arch;
2575 
2576                 kvmclock_reset(vcpu);
2577 
2578                 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2579                         bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2580 
2581                         if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2582                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2583 
2584                         ka->boot_vcpu_runs_old_kvmclock = tmp;
2585                 }
2586 
2587                 vcpu->arch.time = data;
2588                 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2589 
2590                 /* we verify if the enable bit is set... */
2591                 if (!(data & 1))
2592                         break;
2593 
2594                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2595                      &vcpu->arch.pv_time, data & ~1ULL,
2596                      sizeof(struct pvclock_vcpu_time_info)))
2597                         vcpu->arch.pv_time_enabled = false;
2598                 else
2599                         vcpu->arch.pv_time_enabled = true;
2600 
2601                 break;
2602         }
2603         case MSR_KVM_ASYNC_PF_EN:
2604                 if (kvm_pv_enable_async_pf(vcpu, data))
2605                         return 1;
2606                 break;
2607         case MSR_KVM_STEAL_TIME:
2608 
2609                 if (unlikely(!sched_info_on()))
2610                         return 1;
2611 
2612                 if (data & KVM_STEAL_RESERVED_MASK)
2613                         return 1;
2614 
2615                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2616                                                 data & KVM_STEAL_VALID_BITS,
2617                                                 sizeof(struct kvm_steal_time)))
2618                         return 1;
2619 
2620                 vcpu->arch.st.msr_val = data;
2621 
2622                 if (!(data & KVM_MSR_ENABLED))
2623                         break;
2624 
2625                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2626 
2627                 break;
2628         case MSR_KVM_PV_EOI_EN:
2629                 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
2630                         return 1;
2631                 break;
2632 
2633         case MSR_IA32_MCG_CTL:
2634         case MSR_IA32_MCG_STATUS:
2635         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2636                 return set_msr_mce(vcpu, msr_info);
2637 
2638         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2639         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2640                 pr = true; /* fall through */
2641         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2642         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2643                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2644                         return kvm_pmu_set_msr(vcpu, msr_info);
2645 
2646                 if (pr || data != 0)
2647                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2648                                     "0x%x data 0x%llx\n", msr, data);
2649                 break;
2650         case MSR_K7_CLK_CTL:
2651                 /*
2652                  * Ignore all writes to this no longer documented MSR.
2653                  * Writes are only relevant for old K7 processors,
2654                  * all pre-dating SVM, but a recommended workaround from
2655                  * AMD for these chips. It is possible to specify the
2656                  * affected processor models on the command line, hence
2657                  * the need to ignore the workaround.
2658                  */
2659                 break;
2660         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2661         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2662         case HV_X64_MSR_CRASH_CTL:
2663         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2664         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2665         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2666         case HV_X64_MSR_TSC_EMULATION_STATUS:
2667                 return kvm_hv_set_msr_common(vcpu, msr, data,
2668                                              msr_info->host_initiated);
2669         case MSR_IA32_BBL_CR_CTL3:
2670                 /* Drop writes to this legacy MSR -- see rdmsr
2671                  * counterpart for further detail.
2672                  */
2673                 if (report_ignored_msrs)
2674                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2675                                 msr, data);
2676                 break;
2677         case MSR_AMD64_OSVW_ID_LENGTH:
2678                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2679                         return 1;
2680                 vcpu->arch.osvw.length = data;
2681                 break;
2682         case MSR_AMD64_OSVW_STATUS:
2683                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2684                         return 1;
2685                 vcpu->arch.osvw.status = data;
2686                 break;
2687         case MSR_PLATFORM_INFO:
2688                 if (!msr_info->host_initiated ||
2689                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2690                      cpuid_fault_enabled(vcpu)))
2691                         return 1;
2692                 vcpu->arch.msr_platform_info = data;
2693                 break;
2694         case MSR_MISC_FEATURES_ENABLES:
2695                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2696                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2697                      !supports_cpuid_fault(vcpu)))
2698                         return 1;
2699                 vcpu->arch.msr_misc_features_enables = data;
2700                 break;
2701         default:
2702                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2703                         return xen_hvm_config(vcpu, data);
2704                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2705                         return kvm_pmu_set_msr(vcpu, msr_info);
2706                 if (!ignore_msrs) {
2707                         vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2708                                     msr, data);
2709                         return 1;
2710                 } else {
2711                         if (report_ignored_msrs)
2712                                 vcpu_unimpl(vcpu,
2713                                         "ignored wrmsr: 0x%x data 0x%llx\n",
2714                                         msr, data);
2715                         break;
2716                 }
2717         }
2718         return 0;
2719 }
2720 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2721 
2722 
2723 /*
2724  * Reads an msr value (of 'msr_index') into 'pdata'.
2725  * Returns 0 on success, non-0 otherwise.
2726  * Assumes vcpu_load() was already called.
2727  */
2728 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2729 {
2730         return kvm_x86_ops->get_msr(vcpu, msr);
2731 }
2732 EXPORT_SYMBOL_GPL(kvm_get_msr);
2733 
2734 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
2735 {
2736         u64 data;
2737         u64 mcg_cap = vcpu->arch.mcg_cap;
2738         unsigned bank_num = mcg_cap & 0xff;
2739 
2740         switch (msr) {
2741         case MSR_IA32_P5_MC_ADDR:
2742         case MSR_IA32_P5_MC_TYPE:
2743                 data = 0;
2744                 break;
2745         case MSR_IA32_MCG_CAP:
2746                 data = vcpu->arch.mcg_cap;
2747                 break;
2748         case MSR_IA32_MCG_CTL:
2749                 if (!(mcg_cap & MCG_CTL_P) && !host)
2750                         return 1;
2751                 data = vcpu->arch.mcg_ctl;
2752                 break;
2753         case MSR_IA32_MCG_STATUS:
2754                 data = vcpu->arch.mcg_status;
2755                 break;
2756         default:
2757                 if (msr >= MSR_IA32_MC0_CTL &&
2758                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2759                         u32 offset = msr - MSR_IA32_MC0_CTL;
2760                         data = vcpu->arch.mce_banks[offset];
2761                         break;
2762                 }
2763                 return 1;
2764         }
2765         *pdata = data;
2766         return 0;
2767 }
2768 
2769 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2770 {
2771         switch (msr_info->index) {
2772         case MSR_IA32_PLATFORM_ID:
2773         case MSR_IA32_EBL_CR_POWERON:
2774         case MSR_IA32_DEBUGCTLMSR:
2775         case MSR_IA32_LASTBRANCHFROMIP:
2776         case MSR_IA32_LASTBRANCHTOIP:
2777         case MSR_IA32_LASTINTFROMIP:
2778         case MSR_IA32_LASTINTTOIP:
2779         case MSR_K8_SYSCFG:
2780         case MSR_K8_TSEG_ADDR:
2781         case MSR_K8_TSEG_MASK:
2782         case MSR_VM_HSAVE_PA:
2783         case MSR_K8_INT_PENDING_MSG:
2784         case MSR_AMD64_NB_CFG:
2785         case MSR_FAM10H_MMIO_CONF_BASE:
2786         case MSR_AMD64_BU_CFG2:
2787         case MSR_IA32_PERF_CTL:
2788         case MSR_AMD64_DC_CFG:
2789         case MSR_F15H_EX_CFG:
2790                 msr_info->data = 0;
2791                 break;
2792         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2793         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2794         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2795         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2796         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2797                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2798                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2799                 msr_info->data = 0;
2800                 break;
2801         case MSR_IA32_UCODE_REV:
2802                 msr_info->data = vcpu->arch.microcode_version;
2803                 break;
2804         case MSR_IA32_ARCH_CAPABILITIES:
2805                 if (!msr_info->host_initiated &&
2806                     !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
2807                         return 1;
2808                 msr_info->data = vcpu->arch.arch_capabilities;
2809                 break;
2810         case MSR_IA32_TSC:
2811                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2812                 break;
2813         case MSR_MTRRcap:
2814         case 0x200 ... 0x2ff:
2815                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2816         case 0xcd: /* fsb frequency */
2817                 msr_info->data = 3;
2818                 break;
2819                 /*
2820                  * MSR_EBC_FREQUENCY_ID
2821                  * Conservative value valid even for the basic CPU models.
2822                  * Models 0,1: 000 in bits 23:21 indicates a bus speed of
2823                  * 100MHz; model 2: 000 in bits 18:16 indicates 100MHz,
2824                  * and 266MHz for models 3 and 4. Set the Core Clock
2825                  * Frequency to System Bus Frequency Ratio to 1 (bits
2826                  * 31:24) even though this is only valid for CPU
2827                  * models > 2; otherwise guests may end up dividing or
2828                  * multiplying by zero.
2829                  */
2830         case MSR_EBC_FREQUENCY_ID:
2831                 msr_info->data = 1 << 24;
2832                 break;
2833         case MSR_IA32_APICBASE:
2834                 msr_info->data = kvm_get_apic_base(vcpu);
2835                 break;
2836         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2837                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2839         case MSR_IA32_TSCDEADLINE:
2840                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2841                 break;
2842         case MSR_IA32_TSC_ADJUST:
2843                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2844                 break;
2845         case MSR_IA32_MISC_ENABLE:
2846                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2847                 break;
2848         case MSR_IA32_SMBASE:
2849                 if (!msr_info->host_initiated)
2850                         return 1;
2851                 msr_info->data = vcpu->arch.smbase;
2852                 break;
2853         case MSR_SMI_COUNT:
2854                 msr_info->data = vcpu->arch.smi_count;
2855                 break;
2856         case MSR_IA32_PERF_STATUS:
2857                 /* TSC increment by tick */
2858                 msr_info->data = 1000ULL;
2859                 /* CPU multiplier */
2860                 msr_info->data |= (((uint64_t)4ULL) << 40);
2861                 break;
2862         case MSR_EFER:
2863                 msr_info->data = vcpu->arch.efer;
2864                 break;
2865         case MSR_KVM_WALL_CLOCK:
2866         case MSR_KVM_WALL_CLOCK_NEW:
2867                 msr_info->data = vcpu->kvm->arch.wall_clock;
2868                 break;
2869         case MSR_KVM_SYSTEM_TIME:
2870         case MSR_KVM_SYSTEM_TIME_NEW:
2871                 msr_info->data = vcpu->arch.time;
2872                 break;
2873         case MSR_KVM_ASYNC_PF_EN:
2874                 msr_info->data = vcpu->arch.apf.msr_val;
2875                 break;
2876         case MSR_KVM_STEAL_TIME:
2877                 msr_info->data = vcpu->arch.st.msr_val;
2878                 break;
2879         case MSR_KVM_PV_EOI_EN:
2880                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2881                 break;
2882         case MSR_IA32_P5_MC_ADDR:
2883         case MSR_IA32_P5_MC_TYPE:
2884         case MSR_IA32_MCG_CAP:
2885         case MSR_IA32_MCG_CTL:
2886         case MSR_IA32_MCG_STATUS:
2887         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2888                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
2889                                    msr_info->host_initiated);
2890         case MSR_K7_CLK_CTL:
2891                 /*
2892                  * Provide the expected ramp-up count for K7.  All other
2893                  * bits are set to zero, indicating minimum divisors for
2894                  * every field.
2895                  *
2896                  * This prevents guest kernels on an AMD host with CPU
2897                  * type 6, model 8 and higher from crashing because the
2898                  * rdmsr fails.
2899                  */
2900                 msr_info->data = 0x20000000;
2901                 break;
2902         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2903         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2904         case HV_X64_MSR_CRASH_CTL:
2905         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2906         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2907         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2908         case HV_X64_MSR_TSC_EMULATION_STATUS:
2909                 return kvm_hv_get_msr_common(vcpu,
2910                                              msr_info->index, &msr_info->data,
2911                                              msr_info->host_initiated);
2913         case MSR_IA32_BBL_CR_CTL3:
2914                 /* This legacy MSR exists but isn't fully documented in current
2915                  * silicon.  It is however accessed by winxp in very narrow
2916                  * scenarios where it sets bit #19, itself documented as
2917                  * a "reserved" bit.  Best-effort attempt to provide coherent
2918                  * read data here in case the remainder of the register is
2919                  * interpreted by the guest:
2920                  *
2921                  * L2 cache control register 3: 64GB range, 256KB size,
2922                  * enabled, latency 0x1, configured
2923                  */
2924                 msr_info->data = 0xbe702111;
2925                 break;
2926         case MSR_AMD64_OSVW_ID_LENGTH:
2927                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2928                         return 1;
2929                 msr_info->data = vcpu->arch.osvw.length;
2930                 break;
2931         case MSR_AMD64_OSVW_STATUS:
2932                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2933                         return 1;
2934                 msr_info->data = vcpu->arch.osvw.status;
2935                 break;
2936         case MSR_PLATFORM_INFO:
2937                 if (!msr_info->host_initiated &&
2938                     !vcpu->kvm->arch.guest_can_read_msr_platform_info)
2939                         return 1;
2940                 msr_info->data = vcpu->arch.msr_platform_info;
2941                 break;
2942         case MSR_MISC_FEATURES_ENABLES:
2943                 msr_info->data = vcpu->arch.msr_misc_features_enables;
2944                 break;
2945         case MSR_K7_HWCR:
2946                 msr_info->data = vcpu->arch.msr_hwcr;
2947                 break;
2948         default:
2949                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2950                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2951                 if (!ignore_msrs) {
2952                         vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2953                                                msr_info->index);
2954                         return 1;
2955                 } else {
2956                         if (report_ignored_msrs)
2957                                 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2958                                         msr_info->index);
2959                         msr_info->data = 0;
2960                 }
2961                 break;
2962         }
2963         return 0;
2964 }
2965 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2966 
2967 /*
2968  * Read or write a bunch of msrs. All parameters are kernel addresses.
2969  *
2970  * @return number of msrs processed successfully.
2971  */
2972 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2973                     struct kvm_msr_entry *entries,
2974                     int (*do_msr)(struct kvm_vcpu *vcpu,
2975                                   unsigned index, u64 *data))
2976 {
2977         int i;
2978 
2979         for (i = 0; i < msrs->nmsrs; ++i)
2980                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2981                         break;
2982 
2983         return i;
2984 }
2985 
2986 /*
2987  * Read or write a bunch of msrs. Parameters are user addresses.
2988  *
2989  * @return number of msrs processed successfully, or a negative error code.
2990  */
2991 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2992                   int (*do_msr)(struct kvm_vcpu *vcpu,
2993                                 unsigned index, u64 *data),
2994                   int writeback)
2995 {
2996         struct kvm_msrs msrs;
2997         struct kvm_msr_entry *entries;
2998         int r, n;
2999         unsigned size;
3000 
3001         r = -EFAULT;
3002         if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3003                 goto out;
3004 
3005         r = -E2BIG;
3006         if (msrs.nmsrs >= MAX_IO_MSRS)
3007                 goto out;
3008 
3009         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3010         entries = memdup_user(user_msrs->entries, size);
3011         if (IS_ERR(entries)) {
3012                 r = PTR_ERR(entries);
3013                 goto out;
3014         }
3015 
3016         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3017         if (r < 0)
3018                 goto out_free;
3019 
3020         r = -EFAULT;
3021         if (writeback && copy_to_user(user_msrs->entries, entries, size))
3022                 goto out_free;
3023 
3024         r = n;
3025 
3026 out_free:
3027         kfree(entries);
3028 out:
3029         return r;
3030 }
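
/*
 * Illustrative only (not part of the original file): userspace reaches
 * msr_io() through the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls by passing a
 * struct kvm_msrs header immediately followed by nmsrs kvm_msr_entry slots;
 * the ioctl's return value is the number of entries processed, e.g.:
 *
 *	struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } buf = {
 *		.hdr.nmsrs = 1,
 *		.e[0].index = 0x1a0,	// MSR_IA32_MISC_ENABLE
 *	};
 *	int n = ioctl(vcpu_fd, KVM_GET_MSRS, &buf);	// 1 on success
 *
 * (vcpu_fd is a hypothetical, already-created vCPU file descriptor.)
 */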
3031 
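/*
 * MWAIT exits may only be disabled (KVM_X86_DISABLE_EXITS_MWAIT) when
 * letting the guest execute MWAIT natively is safe on this host: the
 * feature must be present, the core must not be affected by the
 * MONITOR/MWAIT erratum (X86_BUG_MONITOR), and ARAT must guarantee the
 * APIC timer keeps running in the deep C-states a guest MWAIT may enter.
 */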
3032 static inline bool kvm_can_mwait_in_guest(void)
3033 {
3034         return boot_cpu_has(X86_FEATURE_MWAIT) &&
3035                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3036                 boot_cpu_has(X86_FEATURE_ARAT);
3037 }
3038 
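/*
 * Report whether, and to what extent, an optional capability is supported.
 * Unknown extensions fall through to the default case and report 0;
 * boolean capabilities report 1 and the remainder return a
 * capability-specific value (a limit, a flag mask, ...).
 */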
3039 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3040 {
3041         int r = 0;
3042 
3043         switch (ext) {
3044         case KVM_CAP_IRQCHIP:
3045         case KVM_CAP_HLT:
3046         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3047         case KVM_CAP_SET_TSS_ADDR:
3048         case KVM_CAP_EXT_CPUID:
3049         case KVM_CAP_EXT_EMUL_CPUID:
3050         case KVM_CAP_CLOCKSOURCE:
3051         case KVM_CAP_PIT:
3052         case KVM_CAP_NOP_IO_DELAY:
3053         case KVM_CAP_MP_STATE:
3054         case KVM_CAP_SYNC_MMU:
3055         case KVM_CAP_USER_NMI:
3056         case KVM_CAP_REINJECT_CONTROL:
3057         case KVM_CAP_IRQ_INJECT_STATUS:
3058         case KVM_CAP_IOEVENTFD:
3059         case KVM_CAP_IOEVENTFD_NO_LENGTH:
3060         case KVM_CAP_PIT2:
3061         case KVM_CAP_PIT_STATE2:
3062         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3063         case KVM_CAP_XEN_HVM:
3064         case KVM_CAP_VCPU_EVENTS:
3065         case KVM_CAP_HYPERV:
3066         case KVM_CAP_HYPERV_VAPIC:
3067         case KVM_CAP_HYPERV_SPIN:
3068         case KVM_CAP_HYPERV_SYNIC:
3069         case KVM_CAP_HYPERV_SYNIC2:
3070         case KVM_CAP_HYPERV_VP_INDEX:
3071         case KVM_CAP_HYPERV_EVENTFD:
3072         case KVM_CAP_HYPERV_TLBFLUSH:
3073         case KVM_CAP_HYPERV_SEND_IPI:
3074         case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3075         case KVM_CAP_HYPERV_CPUID:
3076         case KVM_CAP_PCI_SEGMENT:
3077         case KVM_CAP_DEBUGREGS:
3078         case KVM_CAP_X86_ROBUST_SINGLESTEP:
3079         case KVM_CAP_XSAVE:
3080         case KVM_CAP_ASYNC_PF:
3081         case KVM_CAP_GET_TSC_KHZ:
3082         case KVM_CAP_KVMCLOCK_CTRL:
3083         case KVM_CAP_READONLY_MEM:
3084         case KVM_CAP_HYPERV_TIME:
3085         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3086         case KVM_CAP_TSC_DEADLINE_TIMER:
3087         case KVM_CAP_DISABLE_QUIRKS:
3088         case KVM_CAP_SET_BOOT_CPU_ID:
3089         case KVM_CAP_SPLIT_IRQCHIP:
3090         case KVM_CAP_IMMEDIATE_EXIT:
3091         case KVM_CAP_GET_MSR_FEATURES:
3092         case KVM_CAP_MSR_PLATFORM_INFO:
3093         case KVM_CAP_EXCEPTION_PAYLOAD:
3094                 r = 1;
3095                 break;
3096         case KVM_CAP_SYNC_REGS:
3097                 r = KVM_SYNC_X86_VALID_FIELDS;
3098                 break;
3099         case KVM_CAP_ADJUST_CLOCK:
3100                 r = KVM_CLOCK_TSC_STABLE;
3101                 break;
3102         case KVM_CAP_X86_DISABLE_EXITS:
3103                 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
3104                 if (kvm_can_mwait_in_guest())
3105                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
3106                 break;
3107         case KVM_CAP_X86_SMM:
3108                 /* SMBASE is usually relocated above 1M on modern chipsets,
3109                  * and SMM handlers might indeed rely on 4G segment limits,
3110                  * so do not report SMM to be available if real mode is
3111                  * emulated via vm86 mode.  Still, do not go to great lengths
3112                  * to avoid userspace's usage of the feature, because it is a
3113                  * fringe case that is not enabled except via specific settings
3114                  * of the module parameters.
3115                  */
3116                 r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
3117                 break;
3118         case KVM_CAP_VAPIC:
3119                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
3120                 break;
3121         case KVM_CAP_NR_VCPUS:
3122                 r = KVM_SOFT_MAX_VCPUS;
3123                 break;
3124         case KVM_CAP_MAX_VCPUS:
3125                 r = KVM_MAX_VCPUS;
3126                 break;
3127         case KVM_CAP_MAX_VCPU_ID:
3128                 r = KVM_MAX_VCPU_ID;
3129                 break;
3130         case KVM_CAP_PV_MMU:    /* obsolete */
3131                 r = 0;
3132                 break;
3133         case KVM_CAP_MCE:
3134                 r = KVM_MAX_MCE_BANKS;
3135                 break;
3136         case KVM_CAP_XCRS:
3137                 r = boot_cpu_has(X86_FEATURE_XSAVE);
3138                 break;
3139         case KVM_CAP_TSC_CONTROL:
3140                 r = kvm_has_tsc_control;
3141                 break;
3142         case KVM_CAP_X2APIC_API:
3143                 r = KVM_X2APIC_API_VALID_FLAGS;
3144                 break;
3145         case KVM_CAP_NESTED_STATE:
3146                 r = kvm_x86_ops->get_nested_state ?
3147                         kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
3148                 break;
3149         default:
3150                 break;
3151         }
3152         return r;
3154 }
3155 
3156 long kvm_arch_dev_ioctl(struct file *filp,
3157                         unsigned int ioctl, unsigned long arg)
3158 {
3159         void __user *argp = (void __user *)arg;
3160         long r;
3161 
3162         switch (ioctl) {
3163         case KVM_GET_MSR_INDEX_LIST: {
3164                 struct kvm_msr_list __user *user_msr_list = argp;
3165                 struct kvm_msr_list msr_list;
3166                 unsigned n;
3167 
3168                 r = -EFAULT;
3169                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3170                         goto out;
3171                 n = msr_list.nmsrs;
3172                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3173                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3174                         goto out;
3175                 r = -E2BIG;
3176                 if (n < msr_list.nmsrs)
3177                         goto out;
3178                 r = -EFAULT;
3179                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3180                                  num_msrs_to_save * sizeof(u32)))
3181                         goto out;
3182                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3183                                  &emulated_msrs,
3184                                  num_emulated_msrs * sizeof(u32)))
3185                         goto out;
3186                 r = 0;
3187                 break;
3188         }
3189         case KVM_GET_SUPPORTED_CPUID:
3190         case KVM_GET_EMULATED_CPUID: {
3191                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3192                 struct kvm_cpuid2 cpuid;
3193 
3194                 r = -EFAULT;
3195                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3196                         goto out;
3197 
3198                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3199                                             ioctl);
3200                 if (r)
3201                         goto out;
3202 
3203                 r = -EFAULT;
3204                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3205                         goto out;
3206                 r = 0;
3207                 break;
3208         }
3209         case KVM_X86_GET_MCE_CAP_SUPPORTED:
3210                 r = -EFAULT;
3211                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3212                                  sizeof(kvm_mce_cap_supported)))
3213                         goto out;
3214                 r = 0;
3215                 break;
3216         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3217                 struct kvm_msr_list __user *user_msr_list = argp;
3218                 struct kvm_msr_list msr_list;
3219                 unsigned int n;
3220 
3221                 r = -EFAULT;
3222                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3223                         goto out;
3224                 n = msr_list.nmsrs;
3225                 msr_list.nmsrs = num_msr_based_features;
3226                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3227                         goto out;
3228                 r = -E2BIG;
3229                 if (n < msr_list.nmsrs)
3230                         goto out;
3231                 r = -EFAULT;
3232                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3233                                  num_msr_based_features * sizeof(u32)))
3234                         goto out;
3235                 r = 0;
3236                 break;
3237         }
3238         case KVM_GET_MSRS:
3239                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3240                 break;
3242         default:
3243                 r = -EINVAL;
3244         }
3245 out:
3246         return r;
3247 }
3248 
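/*
 * Guest WBINVD only needs to be emulated with a real cache flush when the
 * guest has non-coherent DMA devices assigned; wbinvd_ipi() is the IPI
 * callback used to flush the caches of the physical CPUs the vCPU has
 * recently run on.
 */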
3249 static void wbinvd_ipi(void *garbage)
3250 {
3251         wbinvd();
3252 }
3253 
3254 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3255 {
3256         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3257 }
3258 
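/*
 * Called when a vCPU is being scheduled onto a physical CPU: flush caches
 * if the guest's WBINVD has to be honoured, reload FPU state the host may
 * have deferred, apply TSC adjustments detected across host suspend, and
 * compensate for unstable or migrated TSCs before the vCPU runs here.
 */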
3259 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3260 {
3261         /* Handle the case where the guest may execute WBINVD */
3262         if (need_emulate_wbinvd(vcpu)) {
3263                 if (kvm_x86_ops->has_wbinvd_exit())
3264                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3265                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3266                         smp_call_function_single(vcpu->cpu,
3267                                         wbinvd_ipi, NULL, 1);
3268         }
3269 
3270         kvm_x86_ops->vcpu_load(vcpu, cpu);
3271 
3272         fpregs_assert_state_consistent();
3273         if (test_thread_flag(TIF_NEED_FPU_LOAD))
3274                 switch_fpu_return();
3275 
3276         /* Apply any externally detected TSC adjustments (due to suspend) */
3277         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3278                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3279                 vcpu->arch.tsc_offset_adjustment = 0;
3280                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3281         }
3282 
3283         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3284                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3285                                 rdtsc() - vcpu->arch.last_host_tsc;
3286                 if (tsc_delta < 0)
3287                         mark_tsc_unstable("KVM discovered backwards TSC");
3288 
3289                 if (kvm_check_tsc_unstable()) {
3290                         u64 offset = kvm_compute_tsc_offset(vcpu,
3291                                                 vcpu->arch.last_guest_tsc);
3292                         kvm_vcpu_write_tsc_offset(vcpu, offset);
3293                         vcpu->arch.tsc_catchup = 1;
3294                 }
3295 
3296                 if (kvm_lapic_hv_timer_in_use(vcpu))
3297                         kvm_lapic_restart_hv_timer(vcpu);
3298 
3299                 /*
3300                  * On a host with synchronized TSC, there is no need to update
3301                  * kvmclock on vcpu->cpu migration
3302                  */
3303                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3304                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3305                 if (vcpu->cpu != cpu)
3306                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3307                 vcpu->cpu = cpu;
3308         }
3309 
3310         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3311 }
3312 
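/*
 * Record in the guest's steal-time area that this vCPU has been preempted,
 * so paravirt-aware guest code (e.g. vcpu_is_preempted()) can react; this
 * is skipped unless the guest enabled steal time via MSR_KVM_STEAL_TIME.
 */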
3313 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3314 {
3315         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3316                 return;
3317 
3318         vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3319 
3320         kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3321                         &vcpu->arch.st.steal.preempted,
3322                         offsetof(struct kvm_steal_time, preempted),
3323                         sizeof(vcpu->arch.st.steal.preempted));
3324 }
3325 
3326 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3327 {
3328         int idx;
3329 
3330         if (vcpu->preempted)
3331                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3332 
3333         /*
3334          * Disable page faults because we're in atomic context here.
3335          * kvm_write_guest_offset_cached() would call might_fault(),
3336          * which relies on pagefault_disable() to tell whether there's
3337          * a bug.  NOTE: the write to guest memory may not go through
3338          * during postcopy live migration or under heavy guest
3339          * paging.
3340          */
3341         pagefault_disable();
3342         /*
3343          * kvm_memslots() will be called by
3344          * kvm_write_guest_offset_cached() so take the srcu lock.
3345          */
3346         idx = srcu_read_lock(&vcpu->kvm->srcu);
3347         kvm_steal_time_set_preempted(vcpu);
3348         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3349         pagefault_enable();
3350         kvm_x86_ops->vcpu_put(vcpu);
3351         vcpu->arch.last_host_tsc = rdtsc();
3352         /*
3353          * If userspace has set any breakpoints or watchpoints, dr6 is restored
3354          * on every vmexit; if not, we might have a stale dr6 from the guest.
3355          * do_debug expects dr6 to be cleared after it runs, so do the same here.
3356          */
3357         set_debugreg(0, 6);
3358 }
3359 
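/*
 * Snapshot the in-kernel local APIC for KVM_GET_LAPIC.  With APICv active,
 * interrupts still sitting in the posted-interrupt descriptor are synced
 * into the IRR first so that the copied state is complete.
 */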
3360 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3361                                     struct kvm_lapic_state *s)
3362 {
3363         if (vcpu->arch.apicv_active)
3364                 kvm_x86_ops->sync_pir_to_irr(vcpu);
3365 
3366         return kvm_apic_get_state(vcpu, s);
3367 }
3368 
3369 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3370                                     struct kvm_lapic_state *s)
3371 {
3372         int r;
3373 
3374         r = kvm_apic_set_state(vcpu, s);
3375         if (r)
3376                 return r;
3377         update_cr8_intercept(vcpu);
3378 
3379         return 0;
3380 }
3381 
3382 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3383 {
3384         return (!lapic_in_kernel(vcpu) ||
3385                 kvm_apic_accept_pic_intr(vcpu));
3386 }
3387 
3388 /*
3389  * If userspace requested an interrupt window, check that the
3390  * interrupt window is open.
3391  *
3392  * No need to exit to userspace if we already have an interrupt queued.
3393  */
3394 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3395 {
3396         return kvm_arch_interrupt_allowed(vcpu) &&
3397                 !kvm_cpu_has_interrupt(vcpu) &&
3398                 !kvm_event_needs_reinjection(vcpu) &&
3399                 kvm_cpu_accept_dm_intr(vcpu);
3400 }
3401 
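/*
 * KVM_INTERRUPT: inject an external interrupt raised by userspace.  With a
 * fully userspace irqchip the vector is queued directly; with a split
 * irqchip (LAPIC in kernel, PIC in userspace) it is latched as the pending
 * EXTINT vector.  A fully in-kernel PIC makes this ioctl pointless, hence
 * -ENXIO.
 */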
3402 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3403                                     struct kvm_interrupt *irq)
3404 {
3405         if (irq->irq >= KVM_NR_INTERRUPTS)
3406                 return -EINVAL;
3407 
3408         if (!irqchip_in_kernel(vcpu->kvm)) {
3409                 kvm_queue_interrupt(vcpu, irq->irq, false);
3410                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3411                 return 0;
3412         }
3413 
3414         /*
3415          * With in-kernel LAPIC, we only use this to inject EXTINT, so
3416          * fail for in-kernel 8259.
3417          */
3418         if (pic_in_kernel(vcpu->kvm))
3419                 return -ENXIO;
3420 
3421         if (vcpu->arch.pending_external_vector != -1)
3422                 return -EEXIST;
3423 
3424         vcpu->arch.pending_external_vector = irq->irq;
3425         kvm_make_request(KVM_REQ_EVENT, vcpu);
3426         return 0;
3427 }
3428 
3429 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3430 {
3431         kvm_inject_nmi(vcpu);
3432 
3433         return 0;
3434 }
3435 
3436 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3437 {
3438         kvm_make_request(KVM_REQ_SMI, vcpu);
3439 
3440         return 0;
3441 }
3442 
3443 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3444                                            struct kvm_tpr_access_ctl *tac)
3445 {
3446         if (tac->flags)
3447                 return -EINVAL;
3448         vcpu->arch.tpr_access_reporting = !!tac->enabled;
3449         return 0;
3450 }
3451 
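/*
 * KVM_X86_SETUP_MCE: configure machine-check emulation for the vCPU.  The
 * requested bank count and capability bits are validated against what KVM
 * supports, then MCG_CTL and every per-bank MCi_CTL register are set to
 * all ones, i.e. all error reporting enabled.
 */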
3452 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3453                                         u64 mcg_cap)
3454 {
3455         int r;
3456         unsigned bank_num = mcg_cap & 0xff, bank;
3457 
3458         r = -EINVAL;
3459         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3460                 goto out;
3461         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3462                 goto out;
3463         r = 0;
3464         vcpu->arch.mcg_cap = mcg_cap;
3465         /* Init IA32_MCG_CTL to all 1s */
3466         if (mcg_cap & MCG_CTL_P)
3467                 vcpu->arch.mcg_ctl = ~(u64)0;
3468         /* Init IA32_MCi_CTL to all 1s */
3469         for (bank = 0; bank < bank_num; bank++)
3470                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3471 
3472         if (kvm_x86_ops->setup_mce)
3473                 kvm_x86_ops->setup_mce(vcpu);
3474 out:
3475         return r;
3476 }
3477 
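/*
 * KVM_X86_SET_MCE: inject a machine-check event described by userspace.
 * Uncorrected errors either raise #MC in the guest or, if MCG_STATUS.MCIP
 * is already set or CR4.MCE is clear, escalate to a triple fault
 * (shutdown), mirroring how real hardware behaves; corrected errors are
 * simply logged into the selected bank.
 */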
3478 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3479                                       struct kvm_x86_mce *mce)
3480 {
3481         u64 mcg_cap = vcpu->arch.mcg_cap;
3482         unsigned bank_num = mcg_cap & 0xff;
3483         u64 *banks = vcpu->arch.mce_banks;
3484 
3485         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3486                 return -EINVAL;
3487         /*
3488          * If IA32_MCG_CTL is not all 1s, uncorrected error
3489          * reporting is disabled.
3490          */
3491         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3492             vcpu->arch.mcg_ctl != ~(u64)0)
3493                 return 0;
3494         banks += 4 * mce->bank;
3495         /*
3496          * If IA32_MCi_CTL is not all 1s, uncorrected error
3497          * reporting is disabled for the bank.
3498          */
3499         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3500                 return 0;
3501         if (mce->status & MCI_STATUS_UC) {
3502                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3503                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3504                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3505                         return 0;
3506                 }
3507                 if (banks[1] & MCI_STATUS_VAL)
3508                         mce->status |= MCI_STATUS_OVER;
3509                 banks[2] = mce->addr;
3510                 banks[3] = mce->misc;
3511                 vcpu->arch.mcg_status = mce->mcg_status;
3512                 banks[1] = mce->status;
3513                 kvm_queue_exception(vcpu, MC_VECTOR);
3514         } else if (!(banks[1] & MCI_STATUS_VAL)
3515                    || !(banks[1] & MCI_STATUS_UC)) {
3516                 if (banks[1] & MCI_STATUS_VAL)
3517                         mce->status |= MCI_STATUS_OVER;
3518                 banks[2] = mce->addr;
3519                 banks[3] = mce->misc;
3520                 banks[1] = mce->status;
3521         } else
3522                 banks[1] |= MCI_STATUS_OVER;
3523         return 0;
3524 }
3525 
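/*
 * KVM_GET_VCPU_EVENTS: snapshot pending and injected exception, interrupt,
 * NMI and SMI state so userspace can save, inspect or migrate it.
 */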
3526 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3527                                                struct kvm_vcpu_events *events)
3528 {
3529         process_nmi(vcpu);
3530 
3531         /*
3532          * The API doesn't provide the instruction length for software
3533          * exceptions, so don't report them. As long as the guest RIP
3534          * isn't advanced, we should expect to encounter the exception
3535          * again.
3536          */
3537         if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
3538                 events->exception.injected = 0;
3539                 events->exception.pending = 0;
3540         } else {
3541                 events->exception.injected = vcpu->arch.exception.injected;
3542                 events->exception.pending = vcpu->arch.exception.pending;
3543                 /*
3544                  * For ABI compatibility, deliberately conflate
3545                  * pending and injected exceptions when
3546                  * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
3547                  */
3548                 if (!vcpu->kvm->arch.exception_payload_enabled)
3549                         events->exception.injected |=
3550                                 vcpu->arch.exception.pending;
3551         }
3552         events->exception.nr = vcpu->arch.exception.nr;
3553         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3554         events->exception.error_code = vcpu->arch.exception.error_code;
3555         events->exception_has_payload = vcpu->arch.exception.has_payload;
3556         events->exception_payload = vcpu->arch.exception.payload;
3557 
3558         events->interrupt.injected =
3559                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3560         events->interrupt.nr = vcpu->arch.interrupt.nr;
3561         events->interrupt.soft = 0;
3562         events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3563 
3564         events->nmi.injected = vcpu->arch.nmi_injected;
3565         events->nmi.pending = vcpu->arch.nmi_pending != 0;
3566         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3567         events->nmi.pad = 0;
3568 
3569         events->sipi_vector = 0; /* never valid when reporting to user space */
3570 
3571         events->smi.smm = is_smm(vcpu);
3572         events->smi.pending = vcpu->arch.smi_pending;
3573         events->smi.smm_inside_nmi =
3574                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3575         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3576 
3577         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3578                          | KVM_VCPUEVENT_VALID_SHADOW
3579                          | KVM_VCPUEVENT_VALID_SMM);
3580         if (vcpu->kvm->arch.exception_payload_enabled)
3581                 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
3582 
3583         memset(&events->reserved, 0, sizeof(events->reserved));
3584 }
3585 
3586 static void kvm_smm_changed(struct kvm_vcpu *vcpu);
3587 
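/*
 * KVM_SET_VCPU_EVENTS: validate and restore event state supplied by
 * userspace.  Exception payloads are only accepted if the capability was
 * enabled, requesting SMM state together with mp_state ==
 * KVM_MP_STATE_INIT_RECEIVED is rejected because INITs are latched while
 * in SMM, and SMM transitions go through kvm_smm_changed() so dependent
 * state stays consistent.
 */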
3588 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3589                                               struct kvm_vcpu_events *events)
3590 {
3591         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3592                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3593                               | KVM_VCPUEVENT_VALID_SHADOW
3594                               | KVM_VCPUEVENT_VALID_SMM
3595                               | KVM_VCPUEVENT_VALID_PAYLOAD))
3596                 return -EINVAL;
3597 
3598         if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
3599                 if (!vcpu->kvm->arch.exception_payload_enabled)
3600                         return -EINVAL;
3601                 if (events->exception.pending)
3602                         events->exception.injected = 0;
3603                 else
3604                         events->exception_has_payload = 0;
3605         } else {
3606                 events->exception.pending = 0;
3607                 events->exception_has_payload = 0;
3608         }
3609 
3610         if ((events->exception.injected || events->exception.pending) &&
3611             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
3612                 return -EINVAL;
3613 
3614         /* INITs are latched while in SMM */
3615         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3616             (events->smi.smm || events->smi.pending) &&
3617             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3618                 return -EINVAL;
3619 
3620         process_nmi(vcpu);
3621         vcpu->arch.exception.injected = events->exception.injected;
3622         vcpu->arch.exception.pending = events->exception.pending;
3623         vcpu->arch.exception.nr = events->exception.nr;
3624         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3625         vcpu->arch.exception.error_code = events->exception.error_code;
3626         vcpu->arch.exception.has_payload = events->exception_has_payload;
3627         vcpu->arch.exception.payload = events->exception_payload;
3628 
3629         vcpu->arch.interrupt.injected = events->interrupt.injected;
3630         vcpu->arch.interrupt.nr = events->interrupt.nr;
3631         vcpu->arch.interrupt.soft = events->interrupt.soft;
3632         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3633                 kvm_x86_ops->set_interrupt_shadow(vcpu,
3634                                                   events->interrupt.shadow);
3635 
3636         vcpu->arch.nmi_injected = events->nmi.injected;
3637         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3638                 vcpu->arch.nmi_pending = events->nmi.pending;
3639         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3640 
3641         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3642             lapic_in_kernel(vcpu))
3643                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3644 
3645         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3646                 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
3647                         if (events->smi.smm)
3648                                 vcpu->arch.hflags |= HF_SMM_MASK;
3649                         else
3650                                 vcpu->arch.hflags &= ~HF_SMM_MASK;
3651                         kvm_smm_changed(vcpu);
3652                 }
3653 
3654                 vcpu->arch.smi_pending = events->smi.pending;
3655 
3656                 if (events->smi.smm) {
3657                         if (events->smi.smm_inside_nmi)
3658                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3659                         else
3660                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3661                         if (lapic_in_kernel(vcpu)) {
3662                                 if (events->smi.latched_init)
3663                                         set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3664                                 else
3665                                         clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3666                         }
3667                 }
3668         }
3669 
3670         kvm_make_request(KVM_REQ_EVENT, vcpu);
3671 
3672         return 0;
3673 }
3674 
3675 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3676                                              struct kvm_debugregs *dbgregs)
3677 {
3678         unsigned long val;
3679 
3680         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3681         kvm_get_dr(vcpu, 6, &val);
3682         dbgregs->dr6 = val;
3683         dbgregs->dr7 = vcpu->arch.dr7;
3684         dbgregs->flags = 0;
3685         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3686 }
3687 
3688 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3689                                             struct kvm_debugregs *dbgregs)
3690 {
3691         if (dbgregs->flags)
3692                 return -EINVAL;
3693 
3694         if (dbgregs->dr6 & ~0xffffffffull)
3695                 return -EINVAL;
3696         if (dbgregs->dr7 & ~0xffffffffull)
3697                 return -EINVAL;
3698 
3699         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3700         kvm_update_dr0123(vcpu);
3701         vcpu->arch.dr6 = dbgregs->dr6;
3702         kvm_update_dr6(vcpu);
3703         vcpu->arch.dr7 = dbgregs->dr7;
3704         kvm_update_dr7(vcpu);
3705 
3706         return 0;
3707 }
3708 
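/*
 * fill_xsave() and load_xsave() convert between the kernel's guest_fpu
 * image, which may use the compacted XSAVES format, and the non-compacted
 * layout exposed by the KVM_GET_XSAVE/KVM_SET_XSAVE ABI.  PKRU is handled
 * specially because the guest's value is kept in vcpu->arch.pkru rather
 * than in the xsave buffer itself.
 */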
3709 #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
3710 
3711 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3712 {
3713         struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
3714         u64 xstate_bv = xsave->header.xfeatures;
3715         u64 valid;
3716 
3717         /*
3718          * Copy legacy XSAVE area, to avoid complications with CPUID
3719          * leaves 0 and 1 in the loop below.
3720          */
3721         memcpy(dest, xsave, XSAVE_HDR_OFFSET);
3722 
3723         /* Set XSTATE_BV */
3724         xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
3725         *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
3726 
3727         /*
3728          * Copy each region from the possibly compacted offset to the
3729          * non-compacted offset.
3730          */
3731         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3732         while (valid) {
3733                 u64 xfeature_mask = valid & -valid;
3734                 int xfeature_nr = fls64(xfeature_mask) - 1;
3735                 void *src = get_xsave_addr(xsave, xfeature_nr);
3736 
3737                 if (src) {
3738                         u32 size, offset, ecx, edx;
3739                         cpuid_count(XSTATE_CPUID, xfeature_nr,
3740                                     &size, &offset, &ecx, &edx);
3741                         if (xfeature_nr == XFEATURE_PKRU)
3742                                 memcpy(dest + offset, &vcpu->arch.pkru,
3743                                        sizeof(vcpu->arch.pkru));
3744                         else
3745                                 memcpy(dest + offset, src, size);
3746 
3747                 }
3748 
3749                 valid -= xfeature_mask;
3750         }
3751 }
3752 
3753 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
3754 {
3755         struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
3756         u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
3757         u64 valid;
3758 
3759         /*
3760          * Copy legacy XSAVE area, to avoid complications with CPUID
3761          * leaves 0 and 1 in the loop below.
3762          */
3763         memcpy(xsave, src, XSAVE_HDR_OFFSET);
3764 
3765         /* Set XSTATE_BV and possibly XCOMP_BV.  */
3766         xsave->header.xfeatures = xstate_bv;
3767         if (boot_cpu_has(X86_FEATURE_XSAVES))
3768                 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
3769 
3770         /*
3771          * Copy each region from the non-compacted offset to the
3772          * possibly compacted offset.
3773          */
3774         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3775         while (valid) {
3776                 u64 xfeature_mask = valid & -valid;
3777                 int xfeature_nr = fls64(xfeature_mask) - 1;
3778                 void *dest = get_xsave_addr(xsave, xfeature_nr);
3779 
3780                 if (dest) {
3781                         u32 size, offset, ecx, edx;
3782                         cpuid_count(XSTATE_CPUID, xfeature_nr,
3783                                     &size, &offset, &ecx, &edx);
3784                         if (xfeature_nr == XFEATURE_PKRU)
3785                                 memcpy(&vcpu->arch.pkru, src + offset,
3786                                        sizeof(vcpu->arch.pkru));
3787                         else
3788                                 memcpy(dest, src + offset, size);
3789                 }
3790 
3791                 valid -= xfeature_mask;
3792         }
3793 }
3794 
3795 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3796                                          struct kvm_xsave *guest_xsave)
3797 {
3798         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
3799                 memset(guest_xsave, 0, sizeof(struct kvm_xsave));
3800                 fill_xsave((u8 *) guest_xsave->region, vcpu);
3801         } else {
3802                 memcpy(guest_xsave->region,
3803                         &vcpu->arch.guest_fpu->state.fxsave,
3804                         sizeof(struct fxregs_state));
3805                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3806                         XFEATURE_MASK_FPSSE;
3807         }
3808 }
3809 
3810 #define XSAVE_MXCSR_OFFSET 24
3811 
3812 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3813                                         struct kvm_xsave *guest_xsave)
3814 {
3815         u64 xstate_bv =
3816                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3817         u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
3818 
3819         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
3820                 /*
3821                  * Here we allow setting states that are not present in
3822                  * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
3823                  * with old userspace.
3824                  */
3825                 if (xstate_bv & ~kvm_supported_xcr0() ||
3826                         mxcsr & ~mxcsr_feature_mask)
3827                         return -EINVAL;
3828                 load_xsave(vcpu, (u8 *)guest_xsave->region);
3829         } else {
3830                 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
3831                         mxcsr & ~mxcsr_feature_mask)
3832                         return -EINVAL;
3833                 memcpy(&vcpu->arch.guest_fpu->state.fxsave,
3834                         guest_xsave->region, sizeof(struct fxregs_state));
3835         }
3836         return 0;
3837 }
3838 
3839 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3840                                         struct kvm_xcrs *guest_xcrs)
3841 {
3842         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
3843                 guest_xcrs->nr_xcrs = 0;
3844                 return;
3845         }
3846 
3847         guest_xcrs->nr_xcrs = 1;
3848         guest_xcrs->flags = 0;
3849         guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3850         guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3851 }
3852 
3853 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3854                                        struct kvm_xcrs *guest_xcrs)
3855 {
3856         int i, r = 0;
3857 
3858         if (!boot_cpu_has(X86_FEATURE_XSAVE))
3859                 return -EINVAL;
3860 
3861         if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3862                 return -EINVAL;
3863 
3864         for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3865                 /* Only support XCR0 currently */
3866                 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3867                         r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3868                                 guest_xcrs->xcrs[i].value);
3869                         break;
3870                 }
3871         if (r)
3872                 r = -EINVAL;
3873         return r;
3874 }
3875 
3876 /*
3877  * kvm_set_guest_paused() indicates to the guest kernel that it has been
3878  * stopped by the hypervisor.  This function will be called from the host only.
3879  * EINVAL is returned when the host attempts to set the flag for a guest that
3880  * -EINVAL is returned when the host attempts to set the flag for a guest that
3881  */
3882 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
3883 {
3884         if (!vcpu->arch.pv_time_enabled)
3885                 return -EINVAL;
3886         vcpu->arch.pvclock_set_guest_stopped_request = true;
3887         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3888         return 0;
3889 }
3890 
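/*
 * Per-vCPU KVM_ENABLE_CAP: currently limited to the Hyper-V SynIC variants
 * and enlightened VMCS; the latter reports the supported eVMCS version back
 * to userspace through the pointer passed in cap->args[0].
 */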
3891 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
3892                                      struct kvm_enable_cap *cap)
3893 {
3894         int r;
3895         uint16_t vmcs_version;
3896         void __user *user_ptr;
3897 
3898         if (cap->flags)
3899                 return -EINVAL;
3900 
3901         switch (cap->cap) {
3902         case KVM_CAP_HYPERV_SYNIC2:
3903                 if (cap->args[0])
3904                         return -EINVAL;
3905                 /* fall through */
3906 
3907         case KVM_CAP_HYPERV_SYNIC:
3908                 if (!irqchip_in_kernel(vcpu->kvm))
3909                         return -EINVAL;
3910                 return kvm_hv_activate_synic(vcpu, cap->cap ==
3911                                              KVM_CAP_HYPERV_SYNIC2);
3912         case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3913                 if (!kvm_x86_ops->nested_enable_evmcs)
3914                         return -ENOTTY;
3915                 r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
3916                 if (!r) {
3917                         user_ptr = (void __user *)(uintptr_t)cap->args[0];
3918                         if (copy_to_user(user_ptr, &vmcs_version,
3919                                          sizeof(vmcs_version)))
3920                                 r = -EFAULT;
3921                 }
3922                 return r;
3923 
3924         default:
3925                 return -EINVAL;
3926         }
3927 }
3928 
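/*
 * Dispatcher for x86-specific vCPU ioctls.  Buffers for the larger payloads
 * (LAPIC, XSAVE, XCRS) are tracked via the union 'u' and freed at the 'out'
 * label; paths that obtain their buffer from memdup_user() jump to
 * 'out_nofree' on failure so an ERR_PTR value is never passed to kfree().
 */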
3929 long kvm_arch_vcpu_ioctl(struct file *filp,
3930                          unsigned int ioctl, unsigned long arg)
3931 {
3932         struct kvm_vcpu *vcpu = filp->private_data;
3933         void __user *argp = (void __user *)arg;
3934         int r;
3935         union {
3936                 struct kvm_lapic_state *lapic;
3937                 struct kvm_xsave *xsave;
3938                 struct kvm_xcrs *xcrs;
3939                 void *buffer;
3940         } u;
3941 
3942         vcpu_load(vcpu);
3943 
3944         u.buffer = NULL;
3945         switch (ioctl) {
3946         case KVM_GET_LAPIC: {
3947                 r = -EINVAL;
3948                 if (!lapic_in_kernel(vcpu))
3949                         goto out;
3950                 u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
3951                                 GFP_KERNEL_ACCOUNT);
3952 
3953                 r = -ENOMEM;
3954                 if (!u.lapic)
3955                         goto out;
3956                 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3957                 if (r)
3958                         goto out;
3959                 r = -EFAULT;
3960                 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3961                         goto out;
3962                 r = 0;
3963                 break;
3964         }
3965         case KVM_SET_LAPIC: {
3966                 r = -EINVAL;
3967                 if (!lapic_in_kernel(vcpu))
3968                         goto out;
3969                 u.lapic = memdup_user(argp, sizeof(*u.lapic));
3970                 if (IS_ERR(u.lapic)) {
3971                         r = PTR_ERR(u.lapic);
3972                         goto out_nofree;
3973                 }
3974 
3975                 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3976                 break;
3977         }
3978         case KVM_INTERRUPT: {
3979                 struct kvm_interrupt irq;
3980 
3981                 r = -EFAULT;
3982                 if (copy_from_user(&irq, argp, sizeof(irq)))
3983                         goto out;
3984                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3985                 break;
3986         }
3987         case KVM_NMI: {
3988                 r = kvm_vcpu_ioctl_nmi(vcpu);
3989                 break;
3990         }
3991         case KVM_SMI: {
3992                 r = kvm_vcpu_ioctl_smi(vcpu);
3993                 break;
3994         }
3995         case KVM_SET_CPUID: {
3996                 struct kvm_cpuid __user *cpuid_arg = argp;
3997                 struct kvm_cpuid cpuid;
3998 
3999                 r = -EFAULT;
4000                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4001                         goto out;
4002                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4003                 break;
4004         }
4005         case KVM_SET_CPUID2: {
4006                 struct kvm_cpuid2 __user *cpuid_arg = argp;
4007                 struct kvm_cpuid2 cpuid;
4008 
4009                 r = -EFAULT;
4010                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4011                         goto out;
4012                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
4013                                               cpuid_arg->entries);
4014                 break;
4015         }
4016         case KVM_GET_CPUID2: {
4017                 struct kvm_cpuid2 __user *cpuid_arg = argp;
4018                 struct kvm_cpuid2 cpuid;
4019 
4020                 r = -EFAULT;
4021                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4022                         goto out;
4023                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
4024                                               cpuid_arg->entries);
4025                 if (r)
4026                         goto out;
4027                 r = -EFAULT;
4028                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4029                         goto out;
4030                 r = 0;
4031                 break;
4032         }
4033         case KVM_GET_MSRS: {
4034                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
4035                 r = msr_io(vcpu, argp, do_get_msr, 1);
4036                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4037                 break;
4038         }
4039         case KVM_SET_MSRS: {
4040                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
4041                 r = msr_io(vcpu, argp, do_set_msr, 0);
4042                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4043                 break;
4044         }
4045         case KVM_TPR_ACCESS_REPORTING: {
4046                 struct kvm_tpr_access_ctl tac;
4047 
4048                 r = -EFAULT;
4049                 if (copy_from_user(&tac, argp, sizeof(tac)))
4050                         goto out;
4051                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
4052                 if (r)
4053                         goto out;
4054                 r = -EFAULT;
4055                 if (copy_to_user(argp, &tac, sizeof(tac)))
4056                         goto out;
4057                 r = 0;
4058                 break;
4059         }
4060         case KVM_SET_VAPIC_ADDR: {
4061                 struct kvm_vapic_addr va;
4062                 int idx;
4063 
4064                 r = -EINVAL;
4065                 if (!lapic_in_kernel(vcpu))
4066                         goto out;
4067                 r = -EFAULT;
4068                 if (copy_from_user(&va, argp, sizeof(va)))
4069                         goto out;
4070                 idx = srcu_read_lock(&vcpu->kvm->srcu);
4071                 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
4072                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4073                 break;
4074         }
4075         case KVM_X86_SETUP_MCE: {
4076                 u64 mcg_cap;
4077 
4078                 r = -EFAULT;
4079                 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
4080                         goto out;
4081                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
4082                 break;
4083         }
4084         case KVM_X86_SET_MCE: {
4085                 struct kvm_x86_mce mce;
4086 
4087                 r = -EFAULT;
4088                 if (copy_from_user(&mce, argp, sizeof(mce)))
4089                         goto out;
4090                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
4091                 break;
4092         }
4093         case KVM_GET_VCPU_EVENTS: {
4094                 struct kvm_vcpu_events events;
4095 
4096                 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
4097 
4098                 r = -EFAULT;
4099                 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
4100                         break;
4101                 r = 0;
4102                 break;
4103         }
4104         case KVM_SET_VCPU_EVENTS: {
4105                 struct kvm_vcpu_events events;
4106 
4107                 r = -EFAULT;
4108                 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
4109                         break;
4110 
4111                 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
4112                 break;
4113         }
4114         case KVM_GET_DEBUGREGS: {
4115                 struct kvm_debugregs dbgregs;
4116 
4117                 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
4118 
4119                 r = -EFAULT;
4120                 if (copy_to_user(argp, &dbgregs,
4121                                  sizeof(struct kvm_debugregs)))
4122                         break;
4123                 r = 0;
4124                 break;
4125         }
4126         case KVM_SET_DEBUGREGS: {
4127                 struct kvm_debugregs dbgregs;
4128 
4129                 r = -EFAULT;
4130                 if (copy_from_user(&dbgregs, argp,
4131                                    sizeof(struct kvm_debugregs)))
4132                         break;
4133 
4134                 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
4135                 break;
4136         }
4137         case KVM_GET_XSAVE: {
4138                 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
4139                 r = -ENOMEM;
4140                 if (!u.xsave)
4141                         break;
4142 
4143                 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
4144 
4145                 r = -EFAULT;
4146                 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
4147                         break;
4148                 r = 0;
4149                 break;
4150         }
4151         case KVM_SET_XSAVE: {
4152                 u.xsave = memdup_user(argp, sizeof(*u.xsave));
4153                 if (IS_ERR(u.xsave)) {
4154                         r = PTR_ERR(u.xsave);
4155                         goto out_nofree;
4156                 }
4157 
4158                 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
4159                 break;
4160         }
4161         case KVM_GET_XCRS: {
4162                 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
4163                 r = -ENOMEM;
4164                 if (!u.xcrs)
4165                         break;
4166 
4167                 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
4168 
4169                 r = -EFAULT;
4170                 if (copy_to_user(argp, u.xcrs,
4171                                  sizeof(struct kvm_xcrs)))
4172                         break;
4173                 r = 0;
4174                 break;
4175         }
4176         case KVM_SET_XCRS: {
4177                 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
4178                 if (IS_ERR(u.xcrs)) {
4179                         r = PTR_ERR(u.xcrs);
4180                         goto out_nofree;
4181                 }
4182 
4183                 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
4184                 break;
4185         }
4186         case KVM_SET_TSC_KHZ: {
4187                 u32 user_tsc_khz;
4188 
4189                 r = -EINVAL;
4190                 user_tsc_khz = (u32)arg;
4191 
4192                 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
4193                         goto out;
4194 
4195                 if (user_tsc_khz == 0)
4196                         user_tsc_khz = tsc_khz;
4197 
4198                 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
4199                         r = 0;
4200 
4201                 goto out;
4202         }
4203         case KVM_GET_TSC_KHZ: {
4204                 r = vcpu->arch.virtual_tsc_khz;
4205                 goto out;
4206         }
4207         case KVM_KVMCLOCK_CTRL: {
4208                 r = kvm_set_guest_paused(vcpu);
4209                 goto out;
4210         }
4211         case KVM_ENABLE_CAP: {
4212                 struct kvm_enable_cap cap;
4213 
4214                 r = -EFAULT;
4215                 if (copy_from_user(&cap, argp, sizeof(cap)))
4216                         goto out;
4217                 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
4218                 break;
4219         }
4220         case KVM_GET_NESTED_STATE: {
4221                 struct kvm_nested_state __user *user_kvm_nested_state = argp;
4222                 u32 user_data_size;
4223 
4224                 r = -EINVAL;
4225                 if (!kvm_x86_ops->get_nested_state)
4226                         break;
4227 
4228                 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
4229                 r = -EFAULT;
4230                 if (get_user(user_data_size, &user_kvm_nested_state->size))
4231                         break;
4232 
4233                 r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
4234                                                   user_data_size);
4235                 if (r < 0)
4236                         break;
4237 
4238                 if (r > user_data_size) {
4239                         if (put_user(r, &user_kvm_nested_state->size))
4240                                 r = -EFAULT;
4241                         else
4242                                 r = -E2BIG;
4243                         break;
4244                 }
4245 
4246                 r = 0;
4247                 break;
4248         }
4249         case KVM_SET_NESTED_STATE: {
4250                 struct kvm_nested_state __user *user_kvm_nested_state = argp;
4251                 struct kvm_nested_state kvm_state;
4252 
4253                 r = -EINVAL;
4254                 if (!kvm_x86_ops->set_nested_state)
4255                         break;
4256 
4257                 r = -EFAULT;
4258                 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
4259                         break;
4260 
4261                 r = -EINVAL;
4262                 if (kvm_state.size < sizeof(kvm_state))
4263                         break;
4264 
4265                 if (kvm_state.flags &
4266                     ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
4267                       | KVM_STATE_NESTED_EVMCS))
4268                         break;
4269 
4270                 /* nested_run_pending implies guest_mode.  */
4271                 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
4272                     && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
4273                         break;
4274 
4275                 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
4276                 break;
4277         }
4278         case KVM_GET_SUPPORTED_HV_CPUID: {
4279                 struct kvm_cpuid2 __user *cpuid_arg = argp;
4280                 struct kvm_cpuid2 cpuid;
4281 
4282                 r = -EFAULT;
4283                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4284                         goto out;
4285 
4286                 r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
4287                                                 cpuid_arg->entries);
4288                 if (r)
4289                         goto out;
4290 
4291                 r = -EFAULT;
4292                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4293                         goto out;
4294                 r = 0;
4295                 break;
4296         }
4297         default:
4298                 r = -EINVAL;
4299         }
4300 out:
4301         kfree(u.buffer);
4302 out_nofree:
4303         vcpu_put(vcpu);
4304         return r;
4305 }
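
/*
 * Illustrative userspace sketch (not part of this file): how a VMM might
 * drive the per-vCPU clock ioctls handled in the switch above.  "vcpu_fd"
 * and the 2 GHz value are assumptions; error handling is elided.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_tune_vcpu_clock(int vcpu_fd)
{
        long khz;

        /*
         * KVM_SET_TSC_KHZ passes the frequency directly as the ioctl argument;
         * a value different from the host's requires TSC scaling support.
         */
        ioctl(vcpu_fd, KVM_SET_TSC_KHZ, 2000000UL);

        /* KVM_GET_TSC_KHZ reports the virtual TSC frequency as the return value. */
        khz = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);
        (void)khz;

        /*
         * KVM_KVMCLOCK_CTRL marks the guest as paused so that kvmclock-based
         * soft-lockup detection in the guest is not triggered by the pause.
         */
        ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
}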
4306 
4307 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
4308 {
4309         return VM_FAULT_SIGBUS;
4310 }
4311 
4312 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
4313 {
4314         int ret;
4315 
4316         if (addr > (unsigned int)(-3 * PAGE_SIZE))
4317                 return -EINVAL;
4318         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
4319         return ret;
4320 }
4321 
4322 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
4323                                               u64 ident_addr)
4324 {
4325         return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
4326 }
4327 
4328 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
4329                                          unsigned long kvm_nr_mmu_pages)
4330 {
4331         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
4332                 return -EINVAL;
4333 
4334         mutex_lock(&kvm->slots_lock);
4335 
4336         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
4337         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
4338 
4339         mutex_unlock(&kvm->slots_lock);
4340         return 0;
4341 }
4342 
4343 static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
4344 {
4345         return kvm->arch.n_max_mmu_pages;
4346 }
4347 
4348 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4349 {
4350         struct kvm_pic *pic = kvm->arch.vpic;
4351         int r;
4352 
4353         r = 0;
4354         switch (chip->chip_id) {
4355         case KVM_IRQCHIP_PIC_MASTER:
4356                 memcpy(&chip->chip.pic, &pic->pics[0],
4357                         sizeof(struct kvm_pic_state));
4358                 break;
4359         case KVM_IRQCHIP_PIC_SLAVE:
4360                 memcpy(&chip->chip.pic, &pic->pics[1],
4361                         sizeof(struct kvm_pic_state));
4362                 break;
4363         case KVM_IRQCHIP_IOAPIC:
4364                 kvm_get_ioapic(kvm, &chip->chip.ioapic);
4365                 break;
4366         default:
4367                 r = -EINVAL;
4368                 break;
4369         }
4370         return r;
4371 }
4372 
4373 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
4374 {
4375         struct kvm_pic *pic = kvm->arch.vpic;
4376         int r;
4377 
4378         r = 0;
4379         switch (chip->chip_id) {
4380         case KVM_IRQCHIP_PIC_MASTER:
4381                 spin_lock(&pic->lock);
4382                 memcpy(&pic->pics[0], &chip->chip.pic,
4383                         sizeof(struct kvm_pic_state));
4384                 spin_unlock(&pic->lock);
4385                 break;
4386         case KVM_IRQCHIP_PIC_SLAVE:
4387                 spin_lock(&pic->lock);
4388                 memcpy(&pic->pics[1], &chip->chip.pic,
4389                         sizeof(struct kvm_pic_state));
4390                 spin_unlock(&pic->lock);
4391                 break;
4392         case KVM_IRQCHIP_IOAPIC:
4393                 kvm_set_ioapic(kvm, &chip->chip.ioapic);
4394                 break;
4395         default:
4396                 r = -EINVAL;
4397                 break;
4398         }
4399         kvm_pic_update_irq(pic);
4400         return r;
4401 }
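
/*
 * Illustrative userspace sketch (not part of this file): a read-modify-write
 * of the in-kernel PIC through the KVM_GET_IRQCHIP/KVM_SET_IRQCHIP VM ioctls
 * that end up in the two helpers above.  "vm_fd" is assumed to come from
 * KVM_CREATE_VM with a kernel irqchip already created; error handling elided.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_mask_master_pic_line(int vm_fd, unsigned int line)
{
        struct kvm_irqchip chip = { .chip_id = KVM_IRQCHIP_PIC_MASTER };

        ioctl(vm_fd, KVM_GET_IRQCHIP, &chip);           /* fills chip.chip.pic */
        chip.chip.pic.imr |= 1u << (line & 7);          /* mask one line (illustration) */
        ioctl(vm_fd, KVM_SET_IRQCHIP, &chip);           /* copied back under pic->lock */
}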
4402 
4403 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4404 {
4405         struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
4406 
4407         BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
4408 
4409         mutex_lock(&kps->lock);
4410         memcpy(ps, &kps->channels, sizeof(*ps));
4411         mutex_unlock(&kps->lock);
4412         return 0;
4413 }
4414 
4415 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
4416 {
4417         int i;
4418         struct kvm_pit *pit = kvm->arch.vpit;
4419 
4420         mutex_lock(&pit->pit_state.lock);
4421         memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
4422         for (i = 0; i < 3; i++)
4423                 kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
4424         mutex_unlock(&pit->pit_state.lock);
4425         return 0;
4426 }
4427 
4428 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4429 {
4430         mutex_lock(&kvm->arch.vpit->pit_state.lock);
4431         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
4432                 sizeof(ps->channels));
4433         ps->flags = kvm->arch.vpit->pit_state.flags;
4434         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
4435         memset(&ps->reserved, 0, sizeof(ps->reserved));
4436         return 0;
4437 }
4438 
4439 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
4440 {
4441         int start = 0;
4442         int i;
4443         u32 prev_legacy, cur_legacy;
4444         struct kvm_pit *pit = kvm->arch.vpit;
4445 
4446         mutex_lock(&pit->pit_state.lock);
4447         prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
4448         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
4449         if (!prev_legacy && cur_legacy)
4450                 start = 1;
4451         memcpy(&pit->pit_state.channels, &ps->channels,
4452                sizeof(pit->pit_state.channels));
4453         pit->pit_state.flags = ps->flags;
4454         for (i = 0; i < 3; i++)
4455                 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
4456                                    start && i == 0);
4457         mutex_unlock(&pit->pit_state.lock);
4458         return 0;
4459 }
4460 
4461 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
4462                                  struct kvm_reinject_control *control)
4463 {
4464         struct kvm_pit *pit = kvm->arch.vpit;
4465 
4466         if (!pit)
4467                 return -ENXIO;
4468 
4469         /* pit->pit_state.lock was overloaded to prevent userspace from getting
4470          * an inconsistent state after running multiple KVM_REINJECT_CONTROL
4471          * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
4472          */
4473         mutex_lock(&pit->pit_state.lock);
4474         kvm_pit_set_reinject(pit, control->pit_reinject);
4475         mutex_unlock(&pit->pit_state.lock);
4476 
4477         return 0;
4478 }
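
/*
 * Illustrative userspace sketch (not part of this file): turning off PIT tick
 * reinjection via KVM_REINJECT_CONTROL, which is routed to the helper above.
 * "vm_fd" is an assumption; error handling is elided.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_disable_pit_reinject(int vm_fd)
{
        struct kvm_reinject_control control;

        memset(&control, 0, sizeof(control));
        control.pit_reinject = 0;       /* do not re-deliver ticks the guest missed */
        ioctl(vm_fd, KVM_REINJECT_CONTROL, &control);
}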
4479 
4480 /**
4481  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
4482  * @kvm: kvm instance
4483  * @log: slot id and address to which we copy the log
4484  *
4485  * Steps 1-4 below provide a general overview of dirty page logging. See
4486  * kvm_get_dirty_log_protect() function description for additional details.
4487  *
4488  * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
4489  * always flush the TLB (step 4) even if a previous step failed and the dirty
4490  * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
4491  * API does not preclude a subsequent dirty log read by user space. Flushing
4492  * the TLB ensures that writes will be marked dirty for the next log read.
4493  *
4494  *   1. Take a snapshot of the bit and clear it if needed.
4495  *   2. Write protect the corresponding page.
4496  *   3. Copy the snapshot to the userspace.
4497  *   4. Flush TLB's if needed.
4498  */
4499 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
4500 {
4501         bool flush = false;
4502         int r;
4503 
4504         mutex_lock(&kvm->slots_lock);
4505 
4506         /*
4507          * Flush potentially hardware-cached dirty pages to dirty_bitmap.
4508          */
4509         if (kvm_x86_ops->flush_log_dirty)
4510                 kvm_x86_ops->flush_log_dirty(kvm);
4511 
4512         r = kvm_get_dirty_log_protect(kvm, log, &flush);
4513 
4514         /*
4515          * All the TLBs can be flushed out of mmu lock, see the comments in
4516          * kvm_mmu_slot_remove_write_access().
4517          */
4518         lockdep_assert_held(&kvm->slots_lock);
4519         if (flush)
4520                 kvm_flush_remote_tlbs(kvm);
4521 
4522         mutex_unlock(&kvm->slots_lock);
4523         return r;
4524 }
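
/*
 * Illustrative userspace sketch (not part of this file): fetching the dirty
 * bitmap of one memslot with KVM_GET_DIRTY_LOG, which lands in the function
 * above.  The slot layout (4 KiB pages, 64-bit host) and "vm_fd" are
 * assumptions; the buffer must cover one bit per page, rounded up to whole
 * 64-bit words because the kernel copies the bitmap long by long.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void *example_fetch_dirty_bitmap(int vm_fd, __u32 slot, size_t slot_bytes)
{
        size_t pages = slot_bytes / 4096;
        size_t bitmap_bytes = ((pages + 63) / 64) * 8;
        struct kvm_dirty_log log = { .slot = slot };

        log.dirty_bitmap = calloc(1, bitmap_bytes);
        if (log.dirty_bitmap && ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                free(log.dirty_bitmap);
                return NULL;
        }
        return log.dirty_bitmap;        /* caller scans the snapshot and frees it */
}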
4525 
4526 int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
4527 {
4528         bool flush = false;
4529         int r;
4530 
4531         mutex_lock(&kvm->slots_lock);
4532 
4533         /*
4534          * Flush potentially hardware-cached dirty pages to dirty_bitmap.
4535          */
4536         if (kvm_x86_ops->flush_log_dirty)
4537                 kvm_x86_ops->flush_log_dirty(kvm);
4538 
4539         r = kvm_clear_dirty_log_protect(kvm, log, &flush);
4540 
4541         /*
4542          * All the TLBs can be flushed out of mmu lock, see the comments in
4543          * kvm_mmu_slot_remove_write_access().
4544          */
4545         lockdep_assert_held(&kvm->slots_lock);
4546         if (flush)
4547                 kvm_flush_remote_tlbs(kvm);
4548 
4549         mutex_unlock(&kvm->slots_lock);
4550         return r;
4551 }
4552 
4553 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
4554                         bool line_status)
4555 {
4556         if (!irqchip_in_kernel(kvm))
4557                 return -ENXIO;
4558 
4559         irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
4560                                         irq_event->irq, irq_event->level,
4561                                         line_status);
4562         return 0;
4563 }
4564 
4565 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4566                             struct kvm_enable_cap *cap)
4567 {
4568         int r;
4569 
4570         if (cap->flags)
4571                 return -EINVAL;
4572 
4573         switch (cap->cap) {
4574         case KVM_CAP_DISABLE_QUIRKS:
4575                 kvm->arch.disabled_quirks = cap->args[0];
4576                 r = 0;
4577                 break;
4578         case KVM_CAP_SPLIT_IRQCHIP: {
4579                 mutex_lock(&kvm->lock);
4580                 r = -EINVAL;
4581                 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
4582                         goto split_irqchip_unlock;
4583                 r = -EEXIST;
4584                 if (irqchip_in_kernel(kvm))
4585                         goto split_irqchip_unlock;
4586                 if (kvm->created_vcpus)
4587                         goto split_irqchip_unlock;
4588                 r = kvm_setup_empty_irq_routing(kvm);
4589                 if (r)
4590                         goto split_irqchip_unlock;
4591                 /* Pairs with irqchip_in_kernel. */
4592                 smp_wmb();
4593                 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
4594                 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
4595                 r = 0;
4596 split_irqchip_unlock:
4597                 mutex_unlock(&kvm->lock);
4598                 break;
4599         }
4600         case KVM_CAP_X2APIC_API:
4601                 r = -EINVAL;
4602                 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
4603                         break;
4604 
4605                 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
4606                         kvm->arch.x2apic_format = true;
4607                 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
4608                         kvm->arch.x2apic_broadcast_quirk_disabled = true;
4609 
4610                 r = 0;
4611                 break;
4612         case KVM_CAP_X86_DISABLE_EXITS:
4613                 r = -EINVAL;
4614                 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
4615                         break;
4616 
4617                 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
4618                         kvm_can_mwait_in_guest())
4619                         kvm->arch.mwait_in_guest = true;
4620                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
4621                         kvm->arch.hlt_in_guest = true;
4622                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
4623                         kvm->arch.pause_in_guest = true;
4624                 r = 0;
4625                 break;
4626         case KVM_CAP_MSR_PLATFORM_INFO:
4627                 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
4628                 r = 0;
4629                 break;
4630         case KVM_CAP_EXCEPTION_PAYLOAD:
4631                 kvm->arch.exception_payload_enabled = cap->args[0];
4632                 r = 0;
4633                 break;
4634         default:
4635                 r = -EINVAL;
4636                 break;
4637         }
4638         return r;
4639 }
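
/*
 * Illustrative userspace sketch (not part of this file): enabling one of the
 * VM capabilities handled above via KVM_ENABLE_CAP.  The choice of
 * KVM_CAP_X86_DISABLE_EXITS with the HLT bit is an example, not a
 * recommendation; "vm_fd" is an assumption.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int example_disable_hlt_exits(int vm_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_X86_DISABLE_EXITS;
        cap.args[0] = KVM_X86_DISABLE_EXITS_HLT;        /* guest HLT no longer exits */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);      /* 0 on success, as set above */
}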
4640 
4641 long kvm_arch_vm_ioctl(struct file *filp,
4642                        unsigned int ioctl, unsigned long arg)
4643 {
4644         struct kvm *kvm = filp->private_data;
4645         void __user *argp = (void __user *)arg;
4646         int r = -ENOTTY;
4647         /*
4648          * This union makes it completely explicit to gcc-3.x
4649          * that these variables' stack usage should be
4650          * combined, not added together.
4651          */
4652         union {
4653                 struct kvm_pit_state ps;
4654                 struct kvm_pit_state2 ps2;
4655                 struct kvm_pit_config pit_config;
4656         } u;
4657 
4658         switch (ioctl) {
4659         case KVM_SET_TSS_ADDR:
4660                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
4661                 break;
4662         case KVM_SET_IDENTITY_MAP_ADDR: {
4663                 u64 ident_addr;
4664 
4665                 mutex_lock(&kvm->lock);
4666                 r = -EINVAL;
4667                 if (kvm->created_vcpus)
4668                         goto set_identity_unlock;
4669                 r = -EFAULT;
4670                 if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
4671                         goto set_identity_unlock;
4672                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
4673 set_identity_unlock:
4674                 mutex_unlock(&kvm->lock);
4675                 break;
4676         }
4677         case KVM_SET_NR_MMU_PAGES:
4678                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
4679                 break;
4680         case KVM_GET_NR_MMU_PAGES:
4681                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
4682                 break;
4683         case KVM_CREATE_IRQCHIP: {
4684                 mutex_lock(&kvm->lock);
4685 
4686                 r = -EEXIST;
4687                 if (irqchip_in_kernel(kvm))
4688                         goto create_irqchip_unlock;
4689 
4690                 r = -EINVAL;
4691                 if (kvm->created_vcpus)
4692                         goto create_irqchip_unlock;
4693 
4694                 r = kvm_pic_init(kvm);
4695                 if (r)
4696                         goto create_irqchip_unlock;
4697 
4698                 r = kvm_ioapic_init(kvm);
4699                 if (r) {
4700                         kvm_pic_destroy(kvm);
4701                         goto create_irqchip_unlock;
4702                 }
4703 
4704                 r = kvm_setup_default_irq_routing(kvm);
4705                 if (r) {
4706                         kvm_ioapic_destroy(kvm);
4707                         kvm_pic_destroy(kvm);
4708                         goto create_irqchip_unlock;
4709                 }
4710                 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
4711                 smp_wmb();
4712                 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
4713         create_irqchip_unlock:
4714                 mutex_unlock(&kvm->lock);
4715                 break;
4716         }
4717         case KVM_CREATE_PIT:
4718                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
4719                 goto create_pit;
4720         case KVM_CREATE_PIT2:
4721                 r = -EFAULT;
4722                 if (copy_from_user(&u.pit_config, argp,
4723                                    sizeof(struct kvm_pit_config)))
4724                         goto out;
4725         create_pit:
4726                 mutex_lock(&kvm->lock);
4727                 r = -EEXIST;
4728                 if (kvm->arch.vpit)
4729                         goto create_pit_unlock;
4730                 r = -ENOMEM;
4731                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
4732                 if (kvm->arch.vpit)
4733                         r = 0;
4734         create_pit_unlock:
4735                 mutex_unlock(&kvm->lock);
4736                 break;
4737         case KVM_GET_IRQCHIP: {
4738                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
4739                 struct kvm_irqchip *chip;
4740 
4741                 chip = memdup_user(argp, sizeof(*chip));
4742                 if (IS_ERR(chip)) {
4743                         r = PTR_ERR(chip);
4744                         goto out;
4745                 }
4746 
4747                 r = -ENXIO;
4748                 if (!irqchip_kernel(kvm))
4749                         goto get_irqchip_out;
4750                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
4751                 if (r)
4752                         goto get_irqchip_out;
4753                 r = -EFAULT;
4754                 if (copy_to_user(argp, chip, sizeof(*chip)))
4755                         goto get_irqchip_out;
4756                 r = 0;
4757         get_irqchip_out:
4758                 kfree(chip);
4759                 break;
4760         }
4761         case KVM_SET_IRQCHIP: {
4762                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
4763                 struct kvm_irqchip *chip;
4764 
4765                 chip = memdup_user(argp, sizeof(*chip));
4766                 if (IS_ERR(chip)) {
4767                         r = PTR_ERR(chip);
4768                         goto out;
4769                 }
4770 
4771                 r = -ENXIO;
4772                 if (!irqchip_kernel(kvm))
4773                         goto set_irqchip_out;
4774                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4775                 if (r)
4776                         goto set_irqchip_out;
4777                 r = 0;
4778         set_irqchip_out:
4779                 kfree(chip);
4780                 break;
4781         }
4782         case KVM_GET_PIT: {
4783                 r = -EFAULT;
4784                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
4785                         goto out;
4786                 r = -ENXIO;
4787                 if (!kvm->arch.vpit)
4788                         goto out;
4789                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
4790                 if (r)
4791                         goto out;
4792                 r = -EFAULT;
4793                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
4794                         goto out;
4795                 r = 0;
4796                 break;
4797         }
4798         case KVM_SET_PIT: {
4799                 r = -EFAULT;
4800                 if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
4801                         goto out;
4802                 r = -ENXIO;
4803                 if (!kvm->arch.vpit)
4804                         goto out;
4805                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
4806                 break;
4807         }
4808         case KVM_GET_PIT2: {
4809                 r = -ENXIO;
4810                 if (!kvm->arch.vpit)
4811                         goto out;
4812                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
4813                 if (r)
4814                         goto out;
4815                 r = -EFAULT;
4816                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
4817                         goto out;
4818                 r = 0;
4819                 break;
4820         }
4821         case KVM_SET_PIT2: {
4822                 r = -EFAULT;
4823                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
4824                         goto out;
4825                 r = -ENXIO;
4826                 if (!kvm->arch.vpit)
4827                         goto out;
4828                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
4829                 break;
4830         }
4831         case KVM_REINJECT_CONTROL: {
4832                 struct kvm_reinject_control control;
4833                 r =  -EFAULT;
4834                 if (copy_from_user(&control, argp, sizeof(control)))
4835                         goto out;
4836                 r = kvm_vm_ioctl_reinject(kvm, &control);
4837                 break;
4838         }
4839         case KVM_SET_BOOT_CPU_ID:
4840                 r = 0;
4841                 mutex_lock(&kvm->lock);
4842                 if (kvm->created_vcpus)
4843                         r = -EBUSY;
4844                 else
4845                         kvm->arch.bsp_vcpu_id = arg;
4846                 mutex_unlock(&kvm->lock);
4847                 break;
4848         case KVM_XEN_HVM_CONFIG: {
4849                 struct kvm_xen_hvm_config xhc;
4850                 r = -EFAULT;
4851                 if (copy_from_user(&xhc, argp, sizeof(xhc)))
4852                         goto out;
4853                 r = -EINVAL;
4854                 if (xhc.flags)
4855                         goto out;
4856                 memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
4857                 r = 0;
4858                 break;
4859         }
4860         case KVM_SET_CLOCK: {
4861                 struct kvm_clock_data user_ns;
4862                 u64 now_ns;
4863 
4864                 r = -EFAULT;
4865                 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
4866                         goto out;
4867 
4868                 r = -EINVAL;
4869                 if (user_ns.flags)
4870                         goto out;
4871 
4872                 r = 0;
4873                 /*
4874                  * TODO: userspace has to take care of races with VCPU_RUN, so
4875                  * kvm_gen_update_masterclock() can be cut down to locked
4876                  * pvclock_update_vm_gtod_copy().
4877                  */
4878                 kvm_gen_update_masterclock(kvm);
4879                 now_ns = get_kvmclock_ns(kvm);
4880                 kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
4881                 kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
4882                 break;
4883         }
4884         case KVM_GET_CLOCK: {
4885                 struct kvm_clock_data user_ns;
4886                 u64 now_ns;
4887 
4888                 now_ns = get_kvmclock_ns(kvm);
4889                 user_ns.clock = now_ns;
4890                 user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
4891                 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
4892 
4893                 r = -EFAULT;
4894                 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
4895                         goto out;
4896                 r = 0;
4897                 break;
4898         }
4899         case KVM_MEMORY_ENCRYPT_OP: {
4900                 r = -ENOTTY;
4901                 if (kvm_x86_ops->mem_enc_op)
4902                         r = kvm_x86_ops->mem_enc_op(kvm, argp);
4903                 break;
4904         }
4905         case KVM_MEMORY_ENCRYPT_REG_REGION: {
4906                 struct kvm_enc_region region;
4907 
4908                 r = -EFAULT;
4909                 if (copy_from_user(&region, argp, sizeof(region)))
4910                         goto out;
4911 
4912                 r = -ENOTTY;
4913                 if (kvm_x86_ops->mem_enc_reg_region)
4914                         r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
4915                 break;
4916         }
4917         case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
4918                 struct kvm_enc_region region;
4919 
4920                 r = -EFAULT;
4921                 if (copy_from_user(&region, argp, sizeof(region)))
4922                         goto out;
4923 
4924                 r = -ENOTTY;
4925                 if (kvm_x86_ops->mem_enc_unreg_region)
4926                         r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
4927                 break;
4928         }
4929         case KVM_HYPERV_EVENTFD: {
4930                 struct kvm_hyperv_eventfd hvevfd;
4931 
4932                 r = -EFAULT;
4933                 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
4934                         goto out;
4935                 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
4936                 break;
4937         }
4938         default:
4939                 r = -ENOTTY;
4940         }
4941 out:
4942         return r;
4943 }
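
/*
 * Illustrative userspace sketch (not part of this file): saving and restoring
 * kvmclock across a pause with the KVM_GET_CLOCK/KVM_SET_CLOCK cases handled
 * above.  "vm_fd" is an assumption and error handling is elided.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_save_restore_kvmclock(int vm_fd)
{
        struct kvm_clock_data data;

        memset(&data, 0, sizeof(data));
        ioctl(vm_fd, KVM_GET_CLOCK, &data);     /* data.clock = current kvmclock, in ns */

        /* ... vCPUs are stopped and guest state is serialized here ... */

        data.flags = 0;                         /* KVM_SET_CLOCK rejects any flags */
        ioctl(vm_fd, KVM_SET_CLOCK, &data);     /* guest time resumes from data.clock */
}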
4944 
4945 static void kvm_init_msr_list(void)
4946 {
4947         u32 dummy[2];
4948         unsigned i, j;
4949 
4950         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4951                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
4952                         continue;
4953 
4954                 /*
4955                  * Even MSRs that are valid in the host may not be exposed
4956                  * to the guests in some cases.
4957                  */
4958                 switch (msrs_to_save[i]) {
4959                 case MSR_IA32_BNDCFGS:
4960                         if (!kvm_mpx_supported())
4961                                 continue;
4962                         break;
4963                 case MSR_TSC_AUX:
4964                         if (!kvm_x86_ops->rdtscp_supported())
4965                                 continue;
4966                         break;
4967                 case MSR_IA32_RTIT_CTL:
4968                 case MSR_IA32_RTIT_STATUS:
4969                         if (!kvm_x86_ops->pt_supported())
4970                                 continue;
4971                         break;
4972                 case MSR_IA32_RTIT_CR3_MATCH:
4973                         if (!kvm_x86_ops->pt_supported() ||
4974                             !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
4975                                 continue;
4976                         break;
4977                 case MSR_IA32_RTIT_OUTPUT_BASE:
4978                 case MSR_IA32_RTIT_OUTPUT_MASK:
4979                         if (!kvm_x86_ops->pt_supported() ||
4980                                 (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
4981                                  !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
4982                                 continue;
4983                         break;
4984                 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
4985                         if (!kvm_x86_ops->pt_supported() ||
4986                                 msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
4987                                 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
4988                                 continue;
4989                         break;
4990                 }
4991                 default:
4992                         break;
4993                 }
4994 
4995                 if (j < i)
4996                         msrs_to_save[j] = msrs_to_save[i];
4997                 j++;
4998         }
4999         num_msrs_to_save = j;
5000 
5001         for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
5002                 if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
5003                         continue;
5004 
5005                 if (j < i)
5006                         emulated_msrs[j] = emulated_msrs[i];
5007                 j++;
5008         }
5009         num_emulated_msrs = j;
5010 
5011         for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
5012                 struct kvm_msr_entry msr;
5013 
5014                 msr.index = msr_based_features[i];
5015                 if (kvm_get_msr_feature(&msr))
5016                         continue;
5017 
5018                 if (j < i)
5019                         msr_based_features[j] = msr_based_features[i];
5020                 j++;
5021         }
5022         num_msr_based_features = j;
5023 }
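
/*
 * The three loops above share one in-place compaction idiom: "i" scans every
 * candidate, "j" counts survivors, and an entry is copied down only once the
 * indices diverge.  A stand-alone sketch of the pattern (the name and the
 * predicate are illustrative, not taken from this file):
 */
static unsigned int example_compact_u32(u32 *table, unsigned int count,
                                        bool (*keep)(u32 val))
{
        unsigned int i, j;

        for (i = j = 0; i < count; i++) {
                if (!keep(table[i]))
                        continue;
                if (j < i)
                        table[j] = table[i];    /* slide the survivor down */
                j++;
        }
        return j;       /* new element count, like num_msrs_to_save above */
}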
5024 
5025 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
5026                            const void *v)
5027 {
5028         int handled = 0;
5029         int n;
5030 
5031         do {
5032                 n = min(len, 8);
5033                 if (!(lapic_in_kernel(vcpu) &&
5034                       !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
5035                     && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
5036                         break;
5037                 handled += n;
5038                 addr += n;
5039                 len -= n;
5040                 v += n;
5041         } while (len);
5042 
5043         return handled;
5044 }
5045 
5046 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
5047 {
5048         int handled = 0;
5049         int n;
5050 
5051         do {
5052                 n = min(len, 8);
5053                 if (!(lapic_in_kernel(vcpu) &&
5054                       !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
5055                                          addr, n, v))
5056                     && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
5057                         break;
5058                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
5059                 handled += n;
5060                 addr += n;
5061                 len -= n;
5062                 v += n;
5063         } while (len);
5064 
5065         return handled;
5066 }
5067 
5068 static void kvm_set_segment(struct kvm_vcpu *vcpu,
5069                         struct kvm_segment *var, int seg)
5070 {
5071         kvm_x86_ops->set_segment(vcpu, var, seg);
5072 }
5073 
5074 void kvm_get_segment(struct kvm_vcpu *vcpu,
5075                      struct kvm_segment *var, int seg)
5076 {
5077         kvm_x86_ops->get_segment(vcpu, var, seg);
5078 }
5079 
5080 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
5081                            struct x86_exception *exception)
5082 {
5083         gpa_t t_gpa;
5084 
5085         BUG_ON(!mmu_is_nested(vcpu));
5086 
5087         /* NPT walks are always user-walks */
5088         access |= PFERR_USER_MASK;
5089         t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
5090 
5091         return t_gpa;
5092 }
5093 
5094 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
5095                               struct x86_exception *exception)
5096 {
5097         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5098         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5099 }
5100 
5101  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
5102                                 struct x86_exception *exception)
5103 {
5104         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5105         access |= PFERR_FETCH_MASK;
5106         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5107 }
5108 
5109 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
5110                                struct x86_exception *exception)
5111 {
5112         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5113         access |= PFERR_WRITE_MASK;
5114         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5115 }
5116 
5117 /* used to access a guest's mapped memory without checking CPL */
5118 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
5119                                 struct x86_exception *exception)
5120 {
5121         return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
5122 }
5123 
5124 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5125                                       struct kvm_vcpu *vcpu, u32 access,
5126                                       struct x86_exception *exception)
5127 {
5128         void *data = val;
5129         int r = X86EMUL_CONTINUE;
5130 
5131         while (bytes) {
5132                 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
5133                                                             exception);
5134                 unsigned offset = addr & (PAGE_SIZE-1);
5135                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
5136                 int ret;
5137 
5138                 if (gpa == UNMAPPED_GVA)
5139                         return X86EMUL_PROPAGATE_FAULT;
5140                 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
5141                                                offset, toread);
5142                 if (ret < 0) {
5143                         r = X86EMUL_IO_NEEDED;
5144                         goto out;
5145                 }
5146 
5147                 bytes -= toread;
5148                 data += toread;
5149                 addr += toread;
5150         }
5151 out:
5152         return r;
5153 }
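
/*
 * The loop above splits a virtual read at page boundaries: each iteration
 * translates one GVA and then copies at most up to the end of that page.  A
 * stand-alone sketch of just the chunk-size arithmetic (name is illustrative):
 */
static unsigned int example_chunk_to_page_end(unsigned long addr,
                                              unsigned int bytes)
{
        unsigned int offset = addr & (PAGE_SIZE - 1);

        /* Never cross into the next page; the caller loops for the remainder. */
        return min(bytes, (unsigned int)PAGE_SIZE - offset);
}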
5154 
5155 /* used for instruction fetching */
5156 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
5157                                 gva_t addr, void *val, unsigned int bytes,
5158                                 struct x86_exception *exception)
5159 {
5160         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5161         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5162         unsigned offset;
5163         int ret;
5164 
5165         /* Inline kvm_read_guest_virt_helper for speed.  */
5166         gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
5167                                                     exception);
5168         if (unlikely(gpa == UNMAPPED_GVA))
5169                 return X86EMUL_PROPAGATE_FAULT;
5170 
5171         offset = addr & (PAGE_SIZE-1);
5172         if (WARN_ON(offset + bytes > PAGE_SIZE))
5173                 bytes = (unsigned)PAGE_SIZE - offset;
5174         ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
5175                                        offset, bytes);
5176         if (unlikely(ret < 0))
5177                 return X86EMUL_IO_NEEDED;
5178 
5179         return X86EMUL_CONTINUE;
5180 }
5181 
5182 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
5183                                gva_t addr, void *val, unsigned int bytes,
5184                                struct x86_exception *exception)
5185 {
5186         u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5187 
5188         /*
5189          * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5190          * is returned, but our callers are not ready for that and they blindly
5191          * call kvm_inject_page_fault.  Ensure that they at least do not leak
5192          * uninitialized kernel stack memory into cr2 and error code.
5193          */
5194         memset(exception, 0, sizeof(*exception));
5195         return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
5196                                           exception);
5197 }
5198 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
5199 
5200 static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
5201                              gva_t addr, void *val, unsigned int bytes,
5202                              struct x86_exception *exception, bool system)
5203 {
5204         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5205         u32 access = 0;
5206 
5207         if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
5208                 access |= PFERR_USER_MASK;
5209 
5210         return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
5211 }
5212 
5213 static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
5214                 unsigned long addr, void *val, unsigned int bytes)
5215 {
5216         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5217         int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
5218 
5219         return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
5220 }
5221 
5222 static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5223                                       struct kvm_vcpu *vcpu, u32 access,
5224                                       struct x86_exception *exception)
5225 {
5226         void *data = val;
5227         int r = X86EMUL_CONTINUE;
5228 
5229         while (bytes) {
5230                 gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
5231                                                              access,
5232                                                              exception);
5233                 unsigned offset = addr & (PAGE_SIZE-1);
5234                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
5235                 int ret;
5236 
5237                 if (gpa == UNMAPPED_GVA)
5238                         return X86EMUL_PROPAGATE_FAULT;
5239                 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
5240                 if (ret < 0) {
5241                         r = X86EMUL_IO_NEEDED;
5242                         goto out;
5243                 }
5244 
5245                 bytes -= towrite;
5246                 data += towrite;
5247                 addr += towrite;
5248         }
5249 out:
5250         return r;
5251 }
5252 
5253 static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
5254                               unsigned int bytes, struct x86_exception *exception,
5255                               bool system)
5256 {
5257         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5258         u32 access = PFERR_WRITE_MASK;
5259 
5260         if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
5261                 access |= PFERR_USER_MASK;
5262 
5263         return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
5264                                            access, exception);
5265 }
5266 
5267 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
5268                                 unsigned int bytes, struct x86_exception *exception)
5269 {
5270         /* kvm_write_guest_virt_system can pull in tons of pages. */
5271         vcpu->arch.l1tf_flush_l1d = true;
5272 
5273         /*
5274          * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5275          * is returned, but our callers are not ready for that and they blindly
5276          * call kvm_inject_page_fault.  Ensure that they at least do not leak
5277          * uninitialized kernel stack memory into cr2 and error code.
5278          */
5279         memset(exception, 0, sizeof(*exception));
5280         return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
5281                                            PFERR_WRITE_MASK, exception);
5282 }
5283 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
5284 
5285 int handle_ud(struct kvm_vcpu *vcpu)
5286 {
5287         int emul_type = EMULTYPE_TRAP_UD;
5288         enum emulation_result er;
5289         char sig[5]; /* ud2; .ascii "kvm" */
5290         struct x86_exception e;
5291 
5292         if (force_emulation_prefix &&
5293             kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
5294                                 sig, sizeof(sig), &e) == 0 &&
5295             memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
5296                 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
5297                 emul_type = 0;
5298         }
5299 
5300         er = kvm_emulate_instruction(vcpu, emul_type);
5301         if (er == EMULATE_USER_EXIT)
5302                 return 0;
5303         if (er != EMULATE_DONE)
5304                 kvm_queue_exception(vcpu, UD_VECTOR);
5305         return 1;
5306 }
5307 EXPORT_SYMBOL_GPL(handle_ud);
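
/*
 * Illustrative guest-side sketch (not part of this file): with KVM's
 * force_emulation_prefix parameter enabled, the 5-byte "ud2 + 'kvm'"
 * signature checked in handle_ud() above makes KVM skip the signature and
 * emulate the instruction that follows it.  The macro and function names are
 * assumptions.
 */
#define EXAMPLE_KVM_FEP ".byte 0x0f, 0x0b, 0x6b, 0x76, 0x6d;"  /* ud2; "kvm" */

static inline void example_forced_emulated_cpuid(void)
{
        unsigned int eax = 0, ebx, ecx = 0, edx;

        /* RIP is advanced past the signature and CPUID goes through the emulator. */
        asm volatile(EXAMPLE_KVM_FEP "cpuid"
                     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
}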
5308 
5309 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
5310                             gpa_t gpa, bool write)
5311 {
5312         /* For APIC access vmexit */
5313         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
5314                 return 1;
5315 
5316         if (vcpu_match_mmio_gpa(vcpu, gpa)) {
5317                 trace_vcpu_match_mmio(gva, gpa, write, true);
5318                 return 1;
5319         }
5320 
5321         return 0;
5322 }
5323 
5324 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
5325                                 gpa_t *gpa, struct x86_exception *exception,
5326                                 bool write)
5327 {
5328         u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
5329                 | (write ? PFERR_WRITE_MASK : 0);
5330 
5331         /*
5332          * Currently PKRU is only applied to EPT-enabled guests, so
5333          * there is no pkey in the EPT page table for an L1 guest or in
5334          * the EPT shadow page table for an L2 guest.
5335          */
5336         if (vcpu_match_mmio_gva(vcpu, gva)
5337             && !permission_fault(vcpu, vcpu->arch.walk_mmu,
5338                                  vcpu->arch.access, 0, access)) {
5339                 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
5340                                         (gva & (PAGE_SIZE - 1));
5341                 trace_vcpu_match_mmio(gva, *gpa, write, false);
5342                 return 1;
5343         }
5344 
5345         *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5346 
5347         if (*gpa == UNMAPPED_GVA)
5348                 return -1;
5349 
5350         return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
5351 }
5352 
5353 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
5354                         const void *val, int bytes)
5355 {
5356         int ret;
5357 
5358         ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
5359         if (ret < 0)
5360                 return 0;
5361         kvm_page_track_write(vcpu, gpa, val, bytes);
5362         return 1;
5363 }
5364 
5365 struct read_write_emulator_ops {
5366         int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
5367                                   int bytes);
5368         int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
5369                                   void *val, int bytes);
5370         int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5371                                int bytes, void *val);
5372         int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
5373                                     void *val, int bytes);
5374         bool write;
5375 };
5376 
5377 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
5378 {
5379         if (vcpu->mmio_read_completed) {
5380                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
5381                                vcpu->mmio_fragments[0].gpa, val);
5382                 vcpu->mmio_read_completed = 0;
5383                 return 1;
5384         }
5385 
5386         return 0;
5387 }
5388 
5389 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5390                         void *val, int bytes)
5391 {
5392         return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
5393 }
5394 
5395 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
5396                          void *val, int bytes)
5397 {
5398         return emulator_write_phys(vcpu, gpa, val, bytes);
5399 }
5400 
5401 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
5402 {
5403         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
5404         return vcpu_mmio_write(vcpu, gpa, bytes, val);
5405 }
5406 
5407 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5408                           void *val, int bytes)
5409 {
5410         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
5411         return X86EMUL_IO_NEEDED;
5412 }
5413 
5414 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
5415                            void *val, int bytes)
5416 {
5417         struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
5418 
5419         memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
5420         return X86EMUL_CONTINUE;
5421 }
5422 
5423 static const struct read_write_emulator_ops read_emultor = {
5424         .read_write_prepare = read_prepare,
5425         .read_write_emulate = read_emulate,
5426         .read_write_mmio = vcpu_mmio_read,
5427         .read_write_exit_mmio = read_exit_mmio,
5428 };
5429 
5430 static const struct read_write_emulator_ops write_emultor = {
5431         .read_write_emulate = write_emulate,
5432         .read_write_mmio = write_mmio,
5433         .read_write_exit_mmio = write_exit_mmio,
5434         .write = true,
5435 };
5436 
5437 static int emulator_read_write_onepage(unsigned long addr, void *val,
5438                                        unsigned int bytes,
5439                                        struct x86_exception *exception,
5440                                        struct kvm_vcpu *vcpu,
5441                                        const struct read_write_emulator_ops *ops)
5442 {
5443         gpa_t gpa;
5444         int handled, ret;
5445         bool write = ops->write;
5446         struct kvm_mmio_fragment *frag;
5447         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5448 
5449         /*
5450          * If the exit was due to an NPF we may already have a GPA.
5451          * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
5452          * Note, this cannot be used on string operations, since a string
5453          * operation using REP will only have the initial GPA from the NPF
5454          * that occurred.
5455          */
5456         if (vcpu->arch.gpa_available &&
5457             emulator_can_use_gpa(ctxt) &&
5458             (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5459                 gpa = vcpu->arch.gpa_val;
5460                 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
5461         } else {
5462                 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
5463                 if (ret < 0)
5464                         return X86EMUL_PROPAGATE_FAULT;
5465         }
5466 
5467         if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
5468                 return X86EMUL_CONTINUE;
5469 
5470         /*
5471          * Is this MMIO handled locally?
5472          */
5473         handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
5474         if (handled == bytes)
5475                 return X86EMUL_CONTINUE;
5476 
5477         gpa += handled;
5478         bytes -= handled;
5479         val += handled;
5480 
5481         WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
5482         frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
5483         frag->gpa = gpa;
5484         frag->data = val;
5485         frag->len = bytes;
5486         return X86EMUL_CONTINUE;
5487 }
5488 
5489 static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
5490                         unsigned long addr,
5491                         void *val, unsigned int bytes,
5492                         struct x86_exception *exception,
5493                         const struct read_write_emulator_ops *ops)
5494 {
5495         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5496         gpa_t gpa;
5497         int rc;
5498 
5499         if (ops->read_write_prepare &&
5500                   ops->read_write_prepare(vcpu, val, bytes))
5501                 return X86EMUL_CONTINUE;
5502 
5503         vcpu->mmio_nr_fragments = 0;
5504 
5505         /* Crossing a page boundary? */
5506         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
5507                 int now;
5508 
5509                 now = -addr & ~PAGE_MASK;
5510                 rc = emulator_read_write_onepage(addr, val, now, exception,
5511                                                  vcpu, ops);
5512 
5513                 if (rc != X86EMUL_CONTINUE)
5514                         return rc;
5515                 addr += now;
5516                 if (ctxt->mode != X86EMUL_MODE_PROT64)
5517                         addr = (u32)addr;
5518                 val += now;
5519                 bytes -= now;
5520         }
5521 
5522         rc = emulator_read_write_onepage(addr, val, bytes, exception,
5523                                          vcpu, ops);
5524         if (rc != X86EMUL_CONTINUE)
5525                 return rc;
5526 
5527         if (!vcpu->mmio_nr_fragments)
5528                 return rc;
5529 
5530         gpa = vcpu->mmio_fragments[0].gpa;
5531 
5532         vcpu->mmio_needed = 1;
5533         vcpu->mmio_cur_fragment = 0;
5534 
5535         vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
5536         vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
5537         vcpu->run->exit_reason = KVM_EXIT_MMIO;
5538         vcpu->run->mmio.phys_addr = gpa;
5539 
5540         return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
5541 }
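
/*
 * Illustrative userspace sketch (not part of this file): when the code above
 * cannot satisfy an access in the kernel it fills vcpu->run and exits with
 * KVM_EXIT_MMIO; the VMM then completes one fragment per exit.  "run" is the
 * mmap()ed struct kvm_run; the 0xff fill-in stands in for a real device model.
 */
#include <string.h>
#include <linux/kvm.h>

static void example_handle_mmio_exit(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_MMIO)
                return;

        if (run->mmio.is_write) {
                /* A real VMM routes run->mmio.data[0..len) to the device at phys_addr. */
                (void)run->mmio.phys_addr;
        } else {
                /* Supply read data; the interrupted emulation resumes on the next KVM_RUN. */
                memset(run->mmio.data, 0xff, run->mmio.len);
        }
}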
5542 
5543 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
5544                                   unsigned long addr,
5545                                   void *val,
5546                                   unsigned int bytes,
5547                                   struct x86_exception *exception)
5548 {
5549         return emulator_read_write(ctxt, addr, val, bytes,
5550                                    exception, &read_emultor);
5551 }
5552 
5553 static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
5554                             unsigned long addr,
5555                             const void *val,
5556                             unsigned int bytes,
5557                             struct x86_exception *exception)
5558 {
5559         return emulator_read_write(ctxt, addr, (void *)val, bytes,
5560                                    exception, &write_emultor);
5561 }
5562 
5563 #define CMPXCHG_TYPE(t, ptr, old, new) \
5564         (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
5565 
5566 #ifdef CONFIG_X86_64
5567 #  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
5568 #else
5569 #  define CMPXCHG64(ptr, old, new) \
5570         (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
5571 #endif
5572 
5573 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5574                                      unsigned long addr,
5575                                      const void *old,
5576                                      const void *new,
5577                                      unsigned int bytes,
5578                                      struct x86_exception *exception)
5579 {
5580         struct kvm_host_map map;
5581         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5582         gpa_t gpa;
5583         char *kaddr;
5584         bool exchanged;
5585 
5586         /* a guest's cmpxchg8b has to be emulated atomically */
5587         if (bytes > 8 || (bytes & (bytes - 1)))
5588                 goto emul_write;
5589 
5590         gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
5591 
5592         if (gpa == UNMAPPED_GVA ||
5593             (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
5594                 goto emul_write;
5595 
5596         if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
5597                 goto emul_write;
5598 
5599         if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
5600                 goto emul_write;
5601 
5602         kaddr = map.hva + offset_in_page(gpa);
5603 
5604         switch (bytes) {
5605         case 1:
5606                 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
5607                 break;
5608         case 2:
5609                 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
5610                 break;
5611         case 4:
5612                 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
5613                 break;
5614         case 8:
5615                 exchanged = CMPXCHG64(kaddr, old, new);
5616                 break;
5617         default:
5618                 BUG();
5619         }
5620 
5621         kvm_vcpu_unmap(vcpu, &map, true);
5622 
5623         if (!exchanged)
5624                 return X86EMUL_CMPXCHG_FAILED;
5625 
5626         kvm_page_track_write(vcpu, gpa, new, bytes);
5627 
5628         return X86EMUL_CONTINUE;
5629 
5630 emul_write:
5631         printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
5632 
5633         return emulator_write_emulated(ctxt, addr, new, bytes, exception);
5634 }
5635 
5636 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
5637 {
5638         int r = 0, i;
5639 
5640         for (i = 0; i < vcpu->arch.pio.count; i++) {
5641                 if (vcpu->arch.pio.in)
5642                         r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
5643                                             vcpu->arch.pio.size, pd);
5644                 else
5645                         r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
5646                                              vcpu->arch.pio.port, vcpu->arch.pio.size,
5647                                              pd);
5648                 if (r)
5649                         break;
5650                 pd += vcpu->arch.pio.size;
5651         }
5652         return r;
5653 }
5654 
5655 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
5656                                unsigned short port, void *val,
5657                                unsigned int count, bool in)
5658 {
5659         vcpu->arch.pio.port = port;
5660         vcpu->arch.pio.in = in;
5661         vcpu->arch.pio.count  = count;
5662         vcpu->arch.pio.size = size;
5663 
5664         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
5665                 vcpu->arch.pio.count = 0;
5666                 return 1;
5667         }
5668 
5669         vcpu->run->exit_reason = KVM_EXIT_IO;
5670         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
5671         vcpu->run->io.size = size;
5672         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
5673         vcpu->run->io.count = count;
5674         vcpu->run->io.port = port;
5675 
5676         return 0;
5677 }
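
/*
 * Illustrative userspace sketch (not part of this file): when kernel_pio()
 * finds no in-kernel device, the exit state filled in above reaches the VMM
 * as KVM_EXIT_IO, with the data buffer living inside the shared kvm_run
 * mapping at io.data_offset (the KVM_PIO_PAGE_OFFSET page used above).  The
 * 0xff fill-in stands in for a real device model.
 */
#include <string.h>
#include <linux/kvm.h>

static void example_handle_io_exit(struct kvm_run *run)
{
        unsigned char *data = (unsigned char *)run + run->io.data_offset;
        unsigned int i;

        if (run->exit_reason != KVM_EXIT_IO)
                return;

        for (i = 0; i < run->io.count; i++, data += run->io.size) {
                if (run->io.direction == KVM_EXIT_IO_OUT)
                        continue;       /* a real VMM would emit data to run->io.port */
                memset(data, 0xff, run->io.size);       /* "float high" input, illustrative */
        }
}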
5678 
5679 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5680                                     int size, unsigned short port, void *val,
5681                                     unsigned int count)
5682 {
5683         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5684         int ret;
5685 
5686         if (vcpu->arch.pio.count)
5687                 goto data_avail;
5688 
5689         memset(vcpu->arch.pio_data, 0, size * count);
5690 
5691         ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
5692         if (ret) {
5693 data_avail:
5694                 memcpy(val, vcpu->arch.pio_data, size * count);
5695                 trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
5696                 vcpu->arch.pio.count = 0;
5697                 return 1;
5698         }
5699 
5700         return 0;
5701 }
5702 
5703 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
5704                                      int size, unsigned short port,
5705                                      const void *val, unsigned int count)
5706 {
5707         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5708 
5709         memcpy(vcpu->arch.pio_data, val, size * count);
5710         trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
5711         return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
5712 }
5713 
5714 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
5715 {
5716         return kvm_x86_ops->get_segment_base(vcpu, seg);
5717 }
5718 
5719 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
5720 {
5721         kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
5722 }
5723 
5724 static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
5725 {
5726         if (!need_emulate_wbinvd(vcpu))
5727                 return X86EMUL_CONTINUE;
5728 
5729         if (kvm_x86_ops->has_wbinvd_exit()) {
5730                 int cpu = get_cpu();
5731 
5732                 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
5733                 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
5734                                 wbinvd_ipi, NULL, 1);
5735                 put_cpu();
5736                 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
5737         } else
5738                 wbinvd();
5739         return X86EMUL_CONTINUE;
5740 }
5741 
5742 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
5743 {
5744         kvm_emulate_wbinvd_noskip(vcpu);
5745         return kvm_skip_emulated_instruction(vcpu);
5746 }
5747 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
5748 
5751 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
5752 {
5753         kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
5754 }
5755 
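     /* Debug-register accessors: forward DR reads and writes to KVM proper. */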
5756 static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
5757                            unsigned long *dest)
5758 {
5759         return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
5760 }
5761 
5762 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
5763                            unsigned long value)
5764 {
5766         return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
5767 }
5768 
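     /*
      * Merge a 32-bit value written by the emulator into the current 64-bit
      * control register value, preserving the upper 32 bits.
      */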
5769 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
5770 {
5771         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
5772 }
5773 
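     /*
      * Control-register read callback for the emulator; unexpected CR
      * numbers are logged and read back as zero.
      */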
5774 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
5775 {
5776         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5777         unsigned long value;
5778 
5779         switch (cr) {
5780         case 0:
5781                 value = kvm_read_cr0(vcpu);
5782                 break;
5783         case 2:
5784                 value = vcpu->arch.cr2;
5785                 break;
5786         case 3:
5787                 value = kvm_read_cr3(vcpu);
5788                 break;
5789         case 4:
5790                 value = kvm_read_cr4(vcpu);
5791                 break;
5792         case 8:
5793                 value = kvm_get_cr8(vcpu);
5794                 break;
5795         default:
5796                 kvm_err("%s: unexpected cr %u\n", __func__, cr);
5797                 return 0;
5798         }
5799 
5800         return value;
5801 }
5802 
5803 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
5804 {
5805         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5806         int res = 0;
5807 
5808         switch (cr) {
5809         case 0:
5810                 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
5811                 break;
5812         case 2:
5813                 vcpu->arch.cr2 = val;
5814                 break;
5815         case 3:
5816                 res = kvm_set_cr3(vcpu, val);
5817                 break;
5818         case 4:
5819                 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
5820                 break;
5821         case 8:
5822                 res = kvm_set_cr8(vcpu, val);
5823                 break;
5824         default:
5825                 kvm_err("%s: unexpected cr %u\n", __func__, cr);
5826                 res = -1;
5827         }
5828 
5829         return res;
5830 }
5831 
5832 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
5833 {
5834         return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
5835 }
5836 
5837 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
5838 {
5839         kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
5840 }
5841 
5842 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
5843 {
5844         kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
5845 }
5846 
5847 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
5848 {
5849         kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
5850 }
5851 
5852 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
5853 {
5854         kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
5855 }
5856 
5857 static unsigned long emulator_get_cached_segment_base(
5858         struct x86_emulate_ctxt *ctxt, int seg)
5859 {
5860         return get_segment_base(emul_to_vcpu(ctxt), seg);
5861 }
5862 
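     /*
      * Convert KVM's kvm_segment representation into the emulator's
      * desc_struct.  Unusable (NULL) segments are reported as an all-zero
      * descriptor and a false return.  KVM caches the limit in bytes, so
      * with the granularity bit set it is scaled back down to 4K units.
      */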
5863 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
5864                                  struct desc_struct *desc, u32 *base3,
5865                                  int seg)
5866 {
5867         struct kvm_segment var;
5868 
5869         kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
5870         *selector = var.selector;
5871 
5872         if (var.unusable) {
5873                 memset(desc, 0, sizeof(*desc));
5874                 if (base3)
5875                         *base3 = 0;
5876                 return false;
5877         }
5878 
5879         if (var.g)
5880                 var.limit >>= 12;
5881         set_desc_limit(desc, var.limit);
5882         set_desc_base(desc, (unsigned long)var.base);
5883 #ifdef CONFIG_X86_64
5884         if (base3)
5885                 *base3 = var.base >> 32;
5886 #endif
5887         desc->type = var.type;
5888         desc->s = var.s;
5889         desc->dpl = var.dpl;
5890         desc->p = var.present;
5891         desc->avl = var.avl;
5892         desc->l = var.l;
5893         desc->d = var.db;
5894         desc->g = var.g;
5895 
5896         return true;
5897 }
5898 
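     /*
      * Inverse of emulator_get_segment(): rebuild a kvm_segment from the
      * emulator's descriptor.  A page-granular limit is expanded back to
      * bytes, and a non-present segment is marked unusable.
      */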
5899 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
5900                                  struct desc_struct *desc, u32 base3,
5901                                  int seg)
5902 {
5903         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5904         struct kvm_segment var;
5905 
5906         var.selector = selector;
5907         var.base = get_desc_base(desc);
5908 #ifdef CONFIG_X86_64
5909         var.base |= ((u64)base3) << 32;
5910 #endif
5911         var.limit = get_desc_limit(desc);
5912         if (desc->g)
5913                 var.limit = (var.limit << 12) | 0xfff;
5914         var.type = desc->type;
5915         var.dpl = desc->dpl;
5916         var.db = desc->d;
5917         var.s = desc->s;
5918         var.l = desc->l;
5919         var.g = desc->g;
5920         var.avl = desc->avl;
5921         var.present = desc->p;
5922         var.unusable = !var.present;
5923         var.padding = 0;
5924 
5925         kvm_set_segment(vcpu, &var, seg);
5927 }
5928 
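     /*
      * MSR accessors for the emulator.  host_initiated is false so the
      * usual guest-visible permission and capability checks apply.
      */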
5929 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
5930                             u32 msr_index, u64 *pdata)
5931 {
5932         struct msr_data msr;
5933         int r;
5934 
5935         msr.index = msr_index;
5936         msr.host_initiated = false;
5937         r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
5938         if (r)
5939                 return r;
5940 
5941         *pdata = msr.data;
5942         return 0;
5943 }
5944 
5945 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
5946                             u32 msr_index, u64 data)
5947 {
5948         struct msr_data msr;
5949 
5950         msr.data = data;
5951         msr.index = msr_index;
5952         msr.host_initiated = false;
5953         return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
5954 }
5955 
5956 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
5957 {
5958         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5959 
5960         return vcpu->arch.smbase;
5961 }
5962 
5963 static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
5964 {
5965         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5966 
5967         vcpu->arch.smbase = smbase;
5968 }
5969 
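     /* RDPMC support: validate and read a performance counter via the vPMU. */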
5970 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
5971                               u32 pmc)
5972 {
5973         return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
5974 }
5975 
5976 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
5977                              u32 pmc, u64 *pdata)
5978 {
5979         return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
5980 }
5981 
5982 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
5983 {
5984         emul_to_vcpu(ctxt)->arch.halt_request = 1;
5985 }
5986 
5987 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
5988                               struct x86_instruction_info *info,
5989                               enum x86_intercept_stage stage)
5990 {
5991         return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
5992 }
5993 
5994 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5995                         u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
5996 {
5997         return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
5998 }
5999 
6000 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
6001 {
6002         return kvm_register_read(emul_to_vcpu(ctxt), reg);
6003 }
6004 
6005 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
6006 {
6007         kvm_register_write(emul_to_vcpu(ctxt), reg, val);
6008 }
6009 
6010 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
6011 {
6012         kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
6013 }
6014 
6015 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
6016 {
6017         return emul_to_vcpu(ctxt)->arch.hflags;
6018 }
6019 
6020 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
6021 {
6022         emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
6023 }
6024 
6025 static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
6026                                   const char *smstate)
6027 {
6028         return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate);
6029 }
6030 
6031 static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
6032 {
6033         kvm_smm_changed(emul_to_vcpu(ctxt));
6034 }
6035 
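     /*
      * Callback table that connects the generic x86 instruction emulator
      * to the KVM-specific helpers defined above.
      */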
6036 static const struct x86_emulate_ops emulate_ops = {
6037         .read_gpr            = emulator_read_gpr,
6038         .write_gpr           = emulator_write_gpr,
6039         .read_std            = emulator_read_std,
6040         .write_std           = emulator_write_std,
6041         .read_phys           = kvm_read_guest_phys_system,
6042         .fetch               = kvm_fetch_guest_virt,
6043         .read_emulated       = emulator_read_emulated,
6044         .write_emulated      = emulator_write_emulated,
6045         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
6046         .invlpg              = emulator_invlpg,
6047         .pio_in_emulated     = emulator_pio_in_emulated,
6048         .pio_out_emulated    = emulator_pio_out_emulated,
6049         .get_segment         = emulator_get_segment,
6050         .set_segment         = emulator_set_segment,
6051         .get_cached_segment_base = emulator_get_cached_segment_base,
6052         .get_gdt             = emulator_get_gdt,