// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>

#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);

#define emul_to_vcpu(ctxt) \
        ((struct kvm_vcpu *)(ctxt)->vcpu)

/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void process_smi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);

struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

#define KVM_X86_OP(func) \
        DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
                                *(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_NULL KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);

static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(report_ignored_msrs);

unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
bool __read_mostly kvm_has_bus_lock_exit;
EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 * advancement entirely.  Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

static bool __read_mostly force_emulation_prefix = false;
module_param(force_emulation_prefix, bool, S_IRUGO);

int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);

/*
 * Restoring the host value for MSRs that are only consumed when running in
 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 * returns to userspace, i.e. the kernel can run with the guest's value.
 */
#define KVM_MAX_NR_USER_RETURN_MSRS 16

struct kvm_user_return_msrs_global {
        int nr;
        u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};

struct kvm_user_return_msrs {
        struct user_return_notifier urn;
        bool registered;
        struct kvm_user_return_msr_values {
                u64 host;
                u64 curr;
        } values[KVM_MAX_NR_USER_RETURN_MSRS];
};

static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
static struct kvm_user_return_msrs __percpu *user_return_msrs;

#define KVM_SUPPORTED_XCR0      (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
                                | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
                                | XFEATURE_MASK_PKRU)

u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);

bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);

u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);

struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("pf_fixed", pf_fixed),
        VCPU_STAT("pf_guest", pf_guest),
        VCPU_STAT("tlb_flush", tlb_flush),
        VCPU_STAT("invlpg", invlpg),
        VCPU_STAT("exits", exits),
        VCPU_STAT("io_exits", io_exits),
        VCPU_STAT("mmio_exits", mmio_exits),
        VCPU_STAT("signal_exits", signal_exits),
        VCPU_STAT("irq_window", irq_window_exits),
        VCPU_STAT("nmi_window", nmi_window_exits),
        VCPU_STAT("halt_exits", halt_exits),
        VCPU_STAT("halt_successful_poll", halt_successful_poll),
        VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
        VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
        VCPU_STAT("halt_wakeup", halt_wakeup),
        VCPU_STAT("hypercalls", hypercalls),
        VCPU_STAT("request_irq", request_irq_exits),
        VCPU_STAT("irq_exits", irq_exits),
        VCPU_STAT("host_state_reload", host_state_reload),
        VCPU_STAT("fpu_reload", fpu_reload),
        VCPU_STAT("insn_emulation", insn_emulation),
        VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
        VCPU_STAT("irq_injections", irq_injections),
        VCPU_STAT("nmi_injections", nmi_injections),
        VCPU_STAT("req_event", req_event),
        VCPU_STAT("l1d_flush", l1d_flush),
        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
        VM_STAT("mmu_flooded", mmu_flooded),
        VM_STAT("mmu_recycled", mmu_recycled),
        VM_STAT("mmu_cache_miss", mmu_cache_miss),
        VM_STAT("mmu_unsync", mmu_unsync),
        VM_STAT("remote_tlb_flush", remote_tlb_flush),
        VM_STAT("largepages", lpages, .mode = 0444),
        VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
        VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
        { NULL }
};

u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);

static struct kmem_cache *x86_fpu_cache;

static struct kmem_cache *x86_emulator_cache;

/*
 * When called, it means the previous get/set msr reached an invalid msr.
 * Return true if we want to ignore/silence this failed msr access.
 */
static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
{
        const char *op = write ? "wrmsr" : "rdmsr";

        if (ignore_msrs) {
                if (report_ignored_msrs)
                        kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
                                      op, msr, data);
                /* Mask the error */
                return true;
        } else {
                kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
                                      op, msr, data);
                return false;
        }
}

static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
        unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
        unsigned int size = sizeof(struct x86_emulate_ctxt);

        return kmem_cache_create_usercopy("x86_emulator", size,
                                          __alignof__(struct x86_emulate_ctxt),
                                          SLAB_ACCOUNT, useroffset,
                                          size - useroffset, NULL);
}

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
        int i;
        for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
                vcpu->arch.apf.gfns[i] = ~0;
}

static void kvm_on_user_return(struct user_return_notifier *urn)
{
        unsigned slot;
        struct kvm_user_return_msrs *msrs
                = container_of(urn, struct kvm_user_return_msrs, urn);
        struct kvm_user_return_msr_values *values;
        unsigned long flags;

        /*
         * Disabling irqs at this point since the following code could be
         * interrupted and executed through kvm_arch_hardware_disable()
         */
        local_irq_save(flags);
        if (msrs->registered) {
                msrs->registered = false;
                user_return_notifier_unregister(urn);
        }
        local_irq_restore(flags);
        for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
                values = &msrs->values[slot];
                if (values->host != values->curr) {
                        wrmsrl(user_return_msrs_global.msrs[slot], values->host);
                        values->curr = values->host;
                }
        }
}

int kvm_probe_user_return_msr(u32 msr)
{
        u64 val;
        int ret;

        preempt_disable();
        ret = rdmsrl_safe(msr, &val);
        if (ret)
                goto out;
        ret = wrmsrl_safe(msr, val);
out:
        preempt_enable();
        return ret;
}
EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);

void kvm_define_user_return_msr(unsigned slot, u32 msr)
{
        BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
        user_return_msrs_global.msrs[slot] = msr;
        if (slot >= user_return_msrs_global.nr)
                user_return_msrs_global.nr = slot + 1;
}
EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);

static void kvm_user_return_msr_cpu_online(void)
{
        unsigned int cpu = smp_processor_id();
        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
        u64 value;
        int i;

        for (i = 0; i < user_return_msrs_global.nr; ++i) {
                rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
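                /*
                 * Record the value just read as both the host baseline and
                 * the cached current value; kvm_on_user_return() only
                 * restores MSRs whose cached value has diverged from host.
                 */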
msrs->values[i].host = value; 373 msrs->values[i].curr = value; 374 } 375 } 376 377 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) 378 { 379 unsigned int cpu = smp_processor_id(); 380 struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 381 int err; 382 383 value = (value & mask) | (msrs->values[slot].host & ~mask); 384 if (value == msrs->values[slot].curr) 385 return 0; 386 err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); 387 if (err) 388 return 1; 389 390 msrs->values[slot].curr = value; 391 if (!msrs->registered) { 392 msrs->urn.on_user_return = kvm_on_user_return; 393 user_return_notifier_register(&msrs->urn); 394 msrs->registered = true; 395 } 396 return 0; 397 } 398 EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); 399 400 static void drop_user_return_notifiers(void) 401 { 402 unsigned int cpu = smp_processor_id(); 403 struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 404 405 if (msrs->registered) 406 kvm_on_user_return(&msrs->urn); 407 } 408 409 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 410 { 411 return vcpu->arch.apic_base; 412 } 413 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 414 415 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) 416 { 417 return kvm_apic_mode(kvm_get_apic_base(vcpu)); 418 } 419 EXPORT_SYMBOL_GPL(kvm_get_apic_mode); 420 421 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 422 { 423 enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); 424 enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); 425 u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | 426 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); 427 428 if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) 429 return 1; 430 if (!msr_info->host_initiated) { 431 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) 432 return 1; 433 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) 434 return 1; 435 } 436 437 kvm_lapic_set_base(vcpu, msr_info->data); 438 kvm_recalculate_apic_map(vcpu->kvm); 439 return 0; 440 } 441 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 442 443 asmlinkage __visible noinstr void kvm_spurious_fault(void) 444 { 445 /* Fault while not rebooting. We want the trace. 
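         * If kvm_rebooting is set, hardware virtualization may already have
         * been disabled for a reboot and the fault is expected, so it is
         * tolerated here instead of triggering the BUG.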
*/ 446 BUG_ON(!kvm_rebooting); 447 } 448 EXPORT_SYMBOL_GPL(kvm_spurious_fault); 449 450 #define EXCPT_BENIGN 0 451 #define EXCPT_CONTRIBUTORY 1 452 #define EXCPT_PF 2 453 454 static int exception_class(int vector) 455 { 456 switch (vector) { 457 case PF_VECTOR: 458 return EXCPT_PF; 459 case DE_VECTOR: 460 case TS_VECTOR: 461 case NP_VECTOR: 462 case SS_VECTOR: 463 case GP_VECTOR: 464 return EXCPT_CONTRIBUTORY; 465 default: 466 break; 467 } 468 return EXCPT_BENIGN; 469 } 470 471 #define EXCPT_FAULT 0 472 #define EXCPT_TRAP 1 473 #define EXCPT_ABORT 2 474 #define EXCPT_INTERRUPT 3 475 476 static int exception_type(int vector) 477 { 478 unsigned int mask; 479 480 if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) 481 return EXCPT_INTERRUPT; 482 483 mask = 1 << vector; 484 485 /* #DB is trap, as instruction watchpoints are handled elsewhere */ 486 if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) 487 return EXCPT_TRAP; 488 489 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) 490 return EXCPT_ABORT; 491 492 /* Reserved exceptions will result in fault */ 493 return EXCPT_FAULT; 494 } 495 496 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) 497 { 498 unsigned nr = vcpu->arch.exception.nr; 499 bool has_payload = vcpu->arch.exception.has_payload; 500 unsigned long payload = vcpu->arch.exception.payload; 501 502 if (!has_payload) 503 return; 504 505 switch (nr) { 506 case DB_VECTOR: 507 /* 508 * "Certain debug exceptions may clear bit 0-3. The 509 * remaining contents of the DR6 register are never 510 * cleared by the processor". 511 */ 512 vcpu->arch.dr6 &= ~DR_TRAP_BITS; 513 /* 514 * In order to reflect the #DB exception payload in guest 515 * dr6, three components need to be considered: active low 516 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, 517 * DR6_BS and DR6_BT) 518 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. 519 * In the target guest dr6: 520 * FIXED_1 bits should always be set. 521 * Active low bits should be cleared if 1-setting in payload. 522 * Active high bits should be set if 1-setting in payload. 523 * 524 * Note, the payload is compatible with the pending debug 525 * exceptions/exit qualification under VMX, that active_low bits 526 * are active high in payload. 527 * So they need to be flipped for DR6. 528 */ 529 vcpu->arch.dr6 |= DR6_ACTIVE_LOW; 530 vcpu->arch.dr6 |= payload; 531 vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; 532 533 /* 534 * The #DB payload is defined as compatible with the 'pending 535 * debug exceptions' field under VMX, not DR6. While bit 12 is 536 * defined in the 'pending debug exceptions' field (enabled 537 * breakpoint), it is reserved and must be zero in DR6. 538 */ 539 vcpu->arch.dr6 &= ~BIT(12); 540 break; 541 case PF_VECTOR: 542 vcpu->arch.cr2 = payload; 543 break; 544 } 545 546 vcpu->arch.exception.has_payload = false; 547 vcpu->arch.exception.payload = 0; 548 } 549 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); 550 551 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 552 unsigned nr, bool has_error, u32 error_code, 553 bool has_payload, unsigned long payload, bool reinject) 554 { 555 u32 prev_nr; 556 int class1, class2; 557 558 kvm_make_request(KVM_REQ_EVENT, vcpu); 559 560 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { 561 queue: 562 if (has_error && !is_protmode(vcpu)) 563 has_error = false; 564 if (reinject) { 565 /* 566 * On vmentry, vcpu->arch.exception.pending is only 567 * true if an event injection was blocked by 568 * nested_run_pending. 
In that case, however, 569 * vcpu_enter_guest requests an immediate exit, 570 * and the guest shouldn't proceed far enough to 571 * need reinjection. 572 */ 573 WARN_ON_ONCE(vcpu->arch.exception.pending); 574 vcpu->arch.exception.injected = true; 575 if (WARN_ON_ONCE(has_payload)) { 576 /* 577 * A reinjected event has already 578 * delivered its payload. 579 */ 580 has_payload = false; 581 payload = 0; 582 } 583 } else { 584 vcpu->arch.exception.pending = true; 585 vcpu->arch.exception.injected = false; 586 } 587 vcpu->arch.exception.has_error_code = has_error; 588 vcpu->arch.exception.nr = nr; 589 vcpu->arch.exception.error_code = error_code; 590 vcpu->arch.exception.has_payload = has_payload; 591 vcpu->arch.exception.payload = payload; 592 if (!is_guest_mode(vcpu)) 593 kvm_deliver_exception_payload(vcpu); 594 return; 595 } 596 597 /* to check exception */ 598 prev_nr = vcpu->arch.exception.nr; 599 if (prev_nr == DF_VECTOR) { 600 /* triple fault -> shutdown */ 601 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 602 return; 603 } 604 class1 = exception_class(prev_nr); 605 class2 = exception_class(nr); 606 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 607 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 608 /* 609 * Generate double fault per SDM Table 5-5. Set 610 * exception.pending = true so that the double fault 611 * can trigger a nested vmexit. 612 */ 613 vcpu->arch.exception.pending = true; 614 vcpu->arch.exception.injected = false; 615 vcpu->arch.exception.has_error_code = true; 616 vcpu->arch.exception.nr = DF_VECTOR; 617 vcpu->arch.exception.error_code = 0; 618 vcpu->arch.exception.has_payload = false; 619 vcpu->arch.exception.payload = 0; 620 } else 621 /* replace previous exception with a new one in a hope 622 that instruction re-execution will regenerate lost 623 exception */ 624 goto queue; 625 } 626 627 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 628 { 629 kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); 630 } 631 EXPORT_SYMBOL_GPL(kvm_queue_exception); 632 633 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 634 { 635 kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); 636 } 637 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 638 639 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, 640 unsigned long payload) 641 { 642 kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); 643 } 644 EXPORT_SYMBOL_GPL(kvm_queue_exception_p); 645 646 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, 647 u32 error_code, unsigned long payload) 648 { 649 kvm_multiple_exception(vcpu, nr, true, error_code, 650 true, payload, false); 651 } 652 653 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) 654 { 655 if (err) 656 kvm_inject_gp(vcpu, 0); 657 else 658 return kvm_skip_emulated_instruction(vcpu); 659 660 return 1; 661 } 662 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); 663 664 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 665 { 666 ++vcpu->stat.pf_guest; 667 vcpu->arch.exception.nested_apf = 668 is_guest_mode(vcpu) && fault->async_page_fault; 669 if (vcpu->arch.exception.nested_apf) { 670 vcpu->arch.apf.nested_apf_token = fault->address; 671 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 672 } else { 673 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, 674 fault->address); 675 } 676 } 677 EXPORT_SYMBOL_GPL(kvm_inject_page_fault); 678 679 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, 680 struct x86_exception *fault) 
681 { 682 struct kvm_mmu *fault_mmu; 683 WARN_ON_ONCE(fault->vector != PF_VECTOR); 684 685 fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : 686 vcpu->arch.walk_mmu; 687 688 /* 689 * Invalidate the TLB entry for the faulting address, if it exists, 690 * else the access will fault indefinitely (and to emulate hardware). 691 */ 692 if ((fault->error_code & PFERR_PRESENT_MASK) && 693 !(fault->error_code & PFERR_RSVD_MASK)) 694 kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, 695 fault_mmu->root_hpa); 696 697 fault_mmu->inject_page_fault(vcpu, fault); 698 return fault->nested_page_fault; 699 } 700 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); 701 702 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 703 { 704 atomic_inc(&vcpu->arch.nmi_queued); 705 kvm_make_request(KVM_REQ_NMI, vcpu); 706 } 707 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 708 709 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 710 { 711 kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); 712 } 713 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 714 715 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 716 { 717 kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); 718 } 719 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); 720 721 /* 722 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 723 * a #GP and return false. 724 */ 725 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 726 { 727 if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) 728 return true; 729 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 730 return false; 731 } 732 EXPORT_SYMBOL_GPL(kvm_require_cpl); 733 734 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) 735 { 736 if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 737 return true; 738 739 kvm_queue_exception(vcpu, UD_VECTOR); 740 return false; 741 } 742 EXPORT_SYMBOL_GPL(kvm_require_dr); 743 744 /* 745 * This function will be used to read from the physical memory of the currently 746 * running guest. The difference to kvm_vcpu_read_guest_page is that this function 747 * can read from guest physical or from the guest's guest physical memory. 748 */ 749 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 750 gfn_t ngfn, void *data, int offset, int len, 751 u32 access) 752 { 753 struct x86_exception exception; 754 gfn_t real_gfn; 755 gpa_t ngpa; 756 757 ngpa = gfn_to_gpa(ngfn); 758 real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); 759 if (real_gfn == UNMAPPED_GVA) 760 return -EFAULT; 761 762 real_gfn = gpa_to_gfn(real_gfn); 763 764 return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); 765 } 766 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 767 768 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 769 void *data, int offset, int len, u32 access) 770 { 771 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 772 data, offset, len, access); 773 } 774 775 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) 776 { 777 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); 778 } 779 780 /* 781 * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 
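 * The PDPTEs are read with kvm_read_guest_page_mmu(), i.e. translated
 * through the given MMU, so they can be fetched either from ordinary guest
 * physical memory or from the guest's guest physical memory (see above).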
782 */ 783 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 784 { 785 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 786 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 787 int i; 788 int ret; 789 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; 790 791 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, 792 offset * sizeof(u64), sizeof(pdpte), 793 PFERR_USER_MASK|PFERR_WRITE_MASK); 794 if (ret < 0) { 795 ret = 0; 796 goto out; 797 } 798 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 799 if ((pdpte[i] & PT_PRESENT_MASK) && 800 (pdpte[i] & pdptr_rsvd_bits(vcpu))) { 801 ret = 0; 802 goto out; 803 } 804 } 805 ret = 1; 806 807 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 808 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); 809 810 out: 811 812 return ret; 813 } 814 EXPORT_SYMBOL_GPL(load_pdptrs); 815 816 bool pdptrs_changed(struct kvm_vcpu *vcpu) 817 { 818 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; 819 int offset; 820 gfn_t gfn; 821 int r; 822 823 if (!is_pae_paging(vcpu)) 824 return false; 825 826 if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) 827 return true; 828 829 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; 830 offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); 831 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 832 PFERR_USER_MASK | PFERR_WRITE_MASK); 833 if (r < 0) 834 return true; 835 836 return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; 837 } 838 EXPORT_SYMBOL_GPL(pdptrs_changed); 839 840 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) 841 { 842 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; 843 844 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 845 kvm_clear_async_pf_completion_queue(vcpu); 846 kvm_async_pf_hash_reset(vcpu); 847 } 848 849 if ((cr0 ^ old_cr0) & update_bits) 850 kvm_mmu_reset_context(vcpu); 851 852 if (((cr0 ^ old_cr0) & X86_CR0_CD) && 853 kvm_arch_has_noncoherent_dma(vcpu->kvm) && 854 !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 855 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); 856 } 857 EXPORT_SYMBOL_GPL(kvm_post_set_cr0); 858 859 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 860 { 861 unsigned long old_cr0 = kvm_read_cr0(vcpu); 862 unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; 863 864 cr0 |= X86_CR0_ET; 865 866 #ifdef CONFIG_X86_64 867 if (cr0 & 0xffffffff00000000UL) 868 return 1; 869 #endif 870 871 cr0 &= ~CR0_RESERVED_BITS; 872 873 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 874 return 1; 875 876 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 877 return 1; 878 879 #ifdef CONFIG_X86_64 880 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && 881 (cr0 & X86_CR0_PG)) { 882 int cs_db, cs_l; 883 884 if (!is_pae(vcpu)) 885 return 1; 886 static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); 887 if (cs_l) 888 return 1; 889 } 890 #endif 891 if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && 892 is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && 893 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) 894 return 1; 895 896 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) 897 return 1; 898 899 static_call(kvm_x86_set_cr0)(vcpu, cr0); 900 901 kvm_post_set_cr0(vcpu, old_cr0, cr0); 902 903 return 0; 904 } 905 EXPORT_SYMBOL_GPL(kvm_set_cr0); 906 907 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 908 { 909 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 910 } 911 EXPORT_SYMBOL_GPL(kvm_lmsw); 912 913 void 
kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) 914 { 915 if (vcpu->arch.guest_state_protected) 916 return; 917 918 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { 919 920 if (vcpu->arch.xcr0 != host_xcr0) 921 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 922 923 if (vcpu->arch.xsaves_enabled && 924 vcpu->arch.ia32_xss != host_xss) 925 wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); 926 } 927 928 if (static_cpu_has(X86_FEATURE_PKU) && 929 (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || 930 (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && 931 vcpu->arch.pkru != vcpu->arch.host_pkru) 932 __write_pkru(vcpu->arch.pkru); 933 } 934 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); 935 936 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) 937 { 938 if (vcpu->arch.guest_state_protected) 939 return; 940 941 if (static_cpu_has(X86_FEATURE_PKU) && 942 (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || 943 (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { 944 vcpu->arch.pkru = rdpkru(); 945 if (vcpu->arch.pkru != vcpu->arch.host_pkru) 946 __write_pkru(vcpu->arch.host_pkru); 947 } 948 949 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { 950 951 if (vcpu->arch.xcr0 != host_xcr0) 952 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 953 954 if (vcpu->arch.xsaves_enabled && 955 vcpu->arch.ia32_xss != host_xss) 956 wrmsrl(MSR_IA32_XSS, host_xss); 957 } 958 959 } 960 EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); 961 962 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 963 { 964 u64 xcr0 = xcr; 965 u64 old_xcr0 = vcpu->arch.xcr0; 966 u64 valid_bits; 967 968 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 969 if (index != XCR_XFEATURE_ENABLED_MASK) 970 return 1; 971 if (!(xcr0 & XFEATURE_MASK_FP)) 972 return 1; 973 if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) 974 return 1; 975 976 /* 977 * Do not allow the guest to set bits that we do not support 978 * saving. However, xcr0 bit 0 is always set, even if the 979 * emulated CPU does not support XSAVE (see fx_init). 
980 */ 981 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; 982 if (xcr0 & ~valid_bits) 983 return 1; 984 985 if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != 986 (!(xcr0 & XFEATURE_MASK_BNDCSR))) 987 return 1; 988 989 if (xcr0 & XFEATURE_MASK_AVX512) { 990 if (!(xcr0 & XFEATURE_MASK_YMM)) 991 return 1; 992 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) 993 return 1; 994 } 995 vcpu->arch.xcr0 = xcr0; 996 997 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) 998 kvm_update_cpuid_runtime(vcpu); 999 return 0; 1000 } 1001 1002 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 1003 { 1004 if (static_call(kvm_x86_get_cpl)(vcpu) == 0) 1005 return __kvm_set_xcr(vcpu, index, xcr); 1006 1007 return 1; 1008 } 1009 EXPORT_SYMBOL_GPL(kvm_set_xcr); 1010 1011 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1012 { 1013 if (cr4 & cr4_reserved_bits) 1014 return false; 1015 1016 if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) 1017 return false; 1018 1019 return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); 1020 } 1021 EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); 1022 1023 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) 1024 { 1025 unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 1026 X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; 1027 1028 if (((cr4 ^ old_cr4) & mmu_role_bits) || 1029 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) 1030 kvm_mmu_reset_context(vcpu); 1031 } 1032 EXPORT_SYMBOL_GPL(kvm_post_set_cr4); 1033 1034 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1035 { 1036 unsigned long old_cr4 = kvm_read_cr4(vcpu); 1037 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 1038 X86_CR4_SMEP; 1039 1040 if (!kvm_is_valid_cr4(vcpu, cr4)) 1041 return 1; 1042 1043 if (is_long_mode(vcpu)) { 1044 if (!(cr4 & X86_CR4_PAE)) 1045 return 1; 1046 if ((cr4 ^ old_cr4) & X86_CR4_LA57) 1047 return 1; 1048 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 1049 && ((cr4 ^ old_cr4) & pdptr_bits) 1050 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 1051 kvm_read_cr3(vcpu))) 1052 return 1; 1053 1054 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { 1055 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) 1056 return 1; 1057 1058 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ 1059 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) 1060 return 1; 1061 } 1062 1063 static_call(kvm_x86_set_cr4)(vcpu, cr4); 1064 1065 kvm_post_set_cr4(vcpu, old_cr4, cr4); 1066 1067 return 0; 1068 } 1069 EXPORT_SYMBOL_GPL(kvm_set_cr4); 1070 1071 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1072 { 1073 bool skip_tlb_flush = false; 1074 #ifdef CONFIG_X86_64 1075 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 1076 1077 if (pcid_enabled) { 1078 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; 1079 cr3 &= ~X86_CR3_PCID_NOFLUSH; 1080 } 1081 #endif 1082 1083 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 1084 if (!skip_tlb_flush) { 1085 kvm_mmu_sync_roots(vcpu); 1086 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1087 } 1088 return 0; 1089 } 1090 1091 /* 1092 * Do not condition the GPA check on long mode, this helper is used to 1093 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that 1094 * the current vCPU mode is accurate. 
1095 */ 1096 if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) 1097 return 1; 1098 1099 if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 1100 return 1; 1101 1102 kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); 1103 vcpu->arch.cr3 = cr3; 1104 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 1105 1106 return 0; 1107 } 1108 EXPORT_SYMBOL_GPL(kvm_set_cr3); 1109 1110 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 1111 { 1112 if (cr8 & CR8_RESERVED_BITS) 1113 return 1; 1114 if (lapic_in_kernel(vcpu)) 1115 kvm_lapic_set_tpr(vcpu, cr8); 1116 else 1117 vcpu->arch.cr8 = cr8; 1118 return 0; 1119 } 1120 EXPORT_SYMBOL_GPL(kvm_set_cr8); 1121 1122 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 1123 { 1124 if (lapic_in_kernel(vcpu)) 1125 return kvm_lapic_get_cr8(vcpu); 1126 else 1127 return vcpu->arch.cr8; 1128 } 1129 EXPORT_SYMBOL_GPL(kvm_get_cr8); 1130 1131 static void kvm_update_dr0123(struct kvm_vcpu *vcpu) 1132 { 1133 int i; 1134 1135 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1136 for (i = 0; i < KVM_NR_DB_REGS; i++) 1137 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 1138 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; 1139 } 1140 } 1141 1142 void kvm_update_dr7(struct kvm_vcpu *vcpu) 1143 { 1144 unsigned long dr7; 1145 1146 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1147 dr7 = vcpu->arch.guest_debug_dr7; 1148 else 1149 dr7 = vcpu->arch.dr7; 1150 static_call(kvm_x86_set_dr7)(vcpu, dr7); 1151 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; 1152 if (dr7 & DR7_BP_EN_MASK) 1153 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; 1154 } 1155 EXPORT_SYMBOL_GPL(kvm_update_dr7); 1156 1157 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) 1158 { 1159 u64 fixed = DR6_FIXED_1; 1160 1161 if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) 1162 fixed |= DR6_RTM; 1163 return fixed; 1164 } 1165 1166 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 1167 { 1168 size_t size = ARRAY_SIZE(vcpu->arch.db); 1169 1170 switch (dr) { 1171 case 0 ... 3: 1172 vcpu->arch.db[array_index_nospec(dr, size)] = val; 1173 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1174 vcpu->arch.eff_db[dr] = val; 1175 break; 1176 case 4: 1177 case 6: 1178 if (!kvm_dr6_valid(val)) 1179 return 1; /* #GP */ 1180 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); 1181 break; 1182 case 5: 1183 default: /* 7 */ 1184 if (!kvm_dr7_valid(val)) 1185 return 1; /* #GP */ 1186 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 1187 kvm_update_dr7(vcpu); 1188 break; 1189 } 1190 1191 return 0; 1192 } 1193 EXPORT_SYMBOL_GPL(kvm_set_dr); 1194 1195 void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 1196 { 1197 size_t size = ARRAY_SIZE(vcpu->arch.db); 1198 1199 switch (dr) { 1200 case 0 ... 3: 1201 *val = vcpu->arch.db[array_index_nospec(dr, size)]; 1202 break; 1203 case 4: 1204 case 6: 1205 *val = vcpu->arch.dr6; 1206 break; 1207 case 5: 1208 default: /* 7 */ 1209 *val = vcpu->arch.dr7; 1210 break; 1211 } 1212 } 1213 EXPORT_SYMBOL_GPL(kvm_get_dr); 1214 1215 bool kvm_rdpmc(struct kvm_vcpu *vcpu) 1216 { 1217 u32 ecx = kvm_rcx_read(vcpu); 1218 u64 data; 1219 int err; 1220 1221 err = kvm_pmu_rdpmc(vcpu, ecx, &data); 1222 if (err) 1223 return err; 1224 kvm_rax_write(vcpu, (u32)data); 1225 kvm_rdx_write(vcpu, data >> 32); 1226 return err; 1227 } 1228 EXPORT_SYMBOL_GPL(kvm_rdpmc); 1229 1230 /* 1231 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 1232 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
1233 * 1234 * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) 1235 * extract the supported MSRs from the related const lists. 1236 * msrs_to_save is selected from the msrs_to_save_all to reflect the 1237 * capabilities of the host cpu. This capabilities test skips MSRs that are 1238 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs 1239 * may depend on host virtualization features rather than host cpu features. 1240 */ 1241 1242 static const u32 msrs_to_save_all[] = { 1243 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1244 MSR_STAR, 1245 #ifdef CONFIG_X86_64 1246 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 1247 #endif 1248 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1249 MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1250 MSR_IA32_SPEC_CTRL, 1251 MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 1252 MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 1253 MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 1254 MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 1255 MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 1256 MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1257 MSR_IA32_UMWAIT_CONTROL, 1258 1259 MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1260 MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, 1261 MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1262 MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1263 MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1264 MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1265 MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1266 MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1267 MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, 1268 MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, 1269 MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, 1270 MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, 1271 MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, 1272 MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1273 MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1274 MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1275 MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1276 MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, 1277 MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, 1278 MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, 1279 MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, 1280 MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, 1281 }; 1282 1283 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; 1284 static unsigned num_msrs_to_save; 1285 1286 static const u32 emulated_msrs_all[] = { 1287 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 1288 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1289 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1290 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1291 HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 1292 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1293 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1294 HV_X64_MSR_RESET, 1295 HV_X64_MSR_VP_INDEX, 1296 HV_X64_MSR_VP_RUNTIME, 1297 HV_X64_MSR_SCONTROL, 1298 HV_X64_MSR_STIMER0_CONFIG, 1299 HV_X64_MSR_VP_ASSIST_PAGE, 1300 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 1301 HV_X64_MSR_TSC_EMULATION_STATUS, 1302 
HV_X64_MSR_SYNDBG_OPTIONS, 1303 HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 1304 HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 1305 HV_X64_MSR_SYNDBG_PENDING_BUFFER, 1306 1307 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1308 MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 1309 1310 MSR_IA32_TSC_ADJUST, 1311 MSR_IA32_TSCDEADLINE, 1312 MSR_IA32_ARCH_CAPABILITIES, 1313 MSR_IA32_PERF_CAPABILITIES, 1314 MSR_IA32_MISC_ENABLE, 1315 MSR_IA32_MCG_STATUS, 1316 MSR_IA32_MCG_CTL, 1317 MSR_IA32_MCG_EXT_CTL, 1318 MSR_IA32_SMBASE, 1319 MSR_SMI_COUNT, 1320 MSR_PLATFORM_INFO, 1321 MSR_MISC_FEATURES_ENABLES, 1322 MSR_AMD64_VIRT_SPEC_CTRL, 1323 MSR_IA32_POWER_CTL, 1324 MSR_IA32_UCODE_REV, 1325 1326 /* 1327 * The following list leaves out MSRs whose values are determined 1328 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. 1329 * We always support the "true" VMX control MSRs, even if the host 1330 * processor does not, so I am putting these registers here rather 1331 * than in msrs_to_save_all. 1332 */ 1333 MSR_IA32_VMX_BASIC, 1334 MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1335 MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1336 MSR_IA32_VMX_TRUE_EXIT_CTLS, 1337 MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1338 MSR_IA32_VMX_MISC, 1339 MSR_IA32_VMX_CR0_FIXED0, 1340 MSR_IA32_VMX_CR4_FIXED0, 1341 MSR_IA32_VMX_VMCS_ENUM, 1342 MSR_IA32_VMX_PROCBASED_CTLS2, 1343 MSR_IA32_VMX_EPT_VPID_CAP, 1344 MSR_IA32_VMX_VMFUNC, 1345 1346 MSR_K7_HWCR, 1347 MSR_KVM_POLL_CONTROL, 1348 }; 1349 1350 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 1351 static unsigned num_emulated_msrs; 1352 1353 /* 1354 * List of msr numbers which are used to expose MSR-based features that 1355 * can be used by a hypervisor to validate requested CPU features. 1356 */ 1357 static const u32 msr_based_features_all[] = { 1358 MSR_IA32_VMX_BASIC, 1359 MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1360 MSR_IA32_VMX_PINBASED_CTLS, 1361 MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1362 MSR_IA32_VMX_PROCBASED_CTLS, 1363 MSR_IA32_VMX_TRUE_EXIT_CTLS, 1364 MSR_IA32_VMX_EXIT_CTLS, 1365 MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1366 MSR_IA32_VMX_ENTRY_CTLS, 1367 MSR_IA32_VMX_MISC, 1368 MSR_IA32_VMX_CR0_FIXED0, 1369 MSR_IA32_VMX_CR0_FIXED1, 1370 MSR_IA32_VMX_CR4_FIXED0, 1371 MSR_IA32_VMX_CR4_FIXED1, 1372 MSR_IA32_VMX_VMCS_ENUM, 1373 MSR_IA32_VMX_PROCBASED_CTLS2, 1374 MSR_IA32_VMX_EPT_VPID_CAP, 1375 MSR_IA32_VMX_VMFUNC, 1376 1377 MSR_F10H_DECFG, 1378 MSR_IA32_UCODE_REV, 1379 MSR_IA32_ARCH_CAPABILITIES, 1380 MSR_IA32_PERF_CAPABILITIES, 1381 }; 1382 1383 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; 1384 static unsigned int num_msr_based_features; 1385 1386 static u64 kvm_get_arch_capabilities(void) 1387 { 1388 u64 data = 0; 1389 1390 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) 1391 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); 1392 1393 /* 1394 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that 1395 * the nested hypervisor runs with NX huge pages. If it is not, 1396 * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other 1397 * L1 guests, so it need not worry about its own (L2) guests. 1398 */ 1399 data |= ARCH_CAP_PSCHANGE_MC_NO; 1400 1401 /* 1402 * If we're doing cache flushes (either "always" or "cond") 1403 * we will do one whenever the guest does a vmlaunch/vmresume. 1404 * If an outer hypervisor is doing the cache flush for us 1405 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that 1406 * capability to the guest too, and if EPT is disabled we're not 1407 * vulnerable. 
Overall, only VMENTER_L1D_FLUSH_NEVER will 1408 * require a nested hypervisor to do a flush of its own. 1409 */ 1410 if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) 1411 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; 1412 1413 if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) 1414 data |= ARCH_CAP_RDCL_NO; 1415 if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) 1416 data |= ARCH_CAP_SSB_NO; 1417 if (!boot_cpu_has_bug(X86_BUG_MDS)) 1418 data |= ARCH_CAP_MDS_NO; 1419 1420 if (!boot_cpu_has(X86_FEATURE_RTM)) { 1421 /* 1422 * If RTM=0 because the kernel has disabled TSX, the host might 1423 * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 1424 * and therefore knows that there cannot be TAA) but keep 1425 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, 1426 * and we want to allow migrating those guests to tsx=off hosts. 1427 */ 1428 data &= ~ARCH_CAP_TAA_NO; 1429 } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { 1430 data |= ARCH_CAP_TAA_NO; 1431 } else { 1432 /* 1433 * Nothing to do here; we emulate TSX_CTRL if present on the 1434 * host so the guest can choose between disabling TSX or 1435 * using VERW to clear CPU buffers. 1436 */ 1437 } 1438 1439 return data; 1440 } 1441 1442 static int kvm_get_msr_feature(struct kvm_msr_entry *msr) 1443 { 1444 switch (msr->index) { 1445 case MSR_IA32_ARCH_CAPABILITIES: 1446 msr->data = kvm_get_arch_capabilities(); 1447 break; 1448 case MSR_IA32_UCODE_REV: 1449 rdmsrl_safe(msr->index, &msr->data); 1450 break; 1451 default: 1452 return static_call(kvm_x86_get_msr_feature)(msr); 1453 } 1454 return 0; 1455 } 1456 1457 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1458 { 1459 struct kvm_msr_entry msr; 1460 int r; 1461 1462 msr.index = index; 1463 r = kvm_get_msr_feature(&msr); 1464 1465 if (r == KVM_MSR_RET_INVALID) { 1466 /* Unconditionally clear the output for simplicity */ 1467 *data = 0; 1468 if (kvm_msr_ignored_check(index, 0, false)) 1469 r = 0; 1470 } 1471 1472 if (r) 1473 return r; 1474 1475 *data = msr.data; 1476 1477 return 0; 1478 } 1479 1480 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 1481 { 1482 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) 1483 return false; 1484 1485 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 1486 return false; 1487 1488 if (efer & (EFER_LME | EFER_LMA) && 1489 !guest_cpuid_has(vcpu, X86_FEATURE_LM)) 1490 return false; 1491 1492 if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) 1493 return false; 1494 1495 return true; 1496 1497 } 1498 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 1499 { 1500 if (efer & efer_reserved_bits) 1501 return false; 1502 1503 return __kvm_valid_efer(vcpu, efer); 1504 } 1505 EXPORT_SYMBOL_GPL(kvm_valid_efer); 1506 1507 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1508 { 1509 u64 old_efer = vcpu->arch.efer; 1510 u64 efer = msr_info->data; 1511 int r; 1512 1513 if (efer & efer_reserved_bits) 1514 return 1; 1515 1516 if (!msr_info->host_initiated) { 1517 if (!__kvm_valid_efer(vcpu, efer)) 1518 return 1; 1519 1520 if (is_paging(vcpu) && 1521 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 1522 return 1; 1523 } 1524 1525 efer &= ~EFER_LMA; 1526 efer |= vcpu->arch.efer & EFER_LMA; 1527 1528 r = static_call(kvm_x86_set_efer)(vcpu, efer); 1529 if (r) { 1530 WARN_ON(r > 0); 1531 return r; 1532 } 1533 1534 /* Update reserved bits */ 1535 if ((efer ^ old_efer) & EFER_NX) 1536 kvm_mmu_reset_context(vcpu); 1537 1538 return 0; 1539 } 1540 1541 void 
kvm_enable_efer_bits(u64 mask) 1542 { 1543 efer_reserved_bits &= ~mask; 1544 } 1545 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 1546 1547 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) 1548 { 1549 struct kvm_x86_msr_filter *msr_filter; 1550 struct msr_bitmap_range *ranges; 1551 struct kvm *kvm = vcpu->kvm; 1552 bool allowed; 1553 int idx; 1554 u32 i; 1555 1556 /* x2APIC MSRs do not support filtering. */ 1557 if (index >= 0x800 && index <= 0x8ff) 1558 return true; 1559 1560 idx = srcu_read_lock(&kvm->srcu); 1561 1562 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); 1563 if (!msr_filter) { 1564 allowed = true; 1565 goto out; 1566 } 1567 1568 allowed = msr_filter->default_allow; 1569 ranges = msr_filter->ranges; 1570 1571 for (i = 0; i < msr_filter->count; i++) { 1572 u32 start = ranges[i].base; 1573 u32 end = start + ranges[i].nmsrs; 1574 u32 flags = ranges[i].flags; 1575 unsigned long *bitmap = ranges[i].bitmap; 1576 1577 if ((index >= start) && (index < end) && (flags & type)) { 1578 allowed = !!test_bit(index - start, bitmap); 1579 break; 1580 } 1581 } 1582 1583 out: 1584 srcu_read_unlock(&kvm->srcu, idx); 1585 1586 return allowed; 1587 } 1588 EXPORT_SYMBOL_GPL(kvm_msr_allowed); 1589 1590 /* 1591 * Write @data into the MSR specified by @index. Select MSR specific fault 1592 * checks are bypassed if @host_initiated is %true. 1593 * Returns 0 on success, non-0 otherwise. 1594 * Assumes vcpu_load() was already called. 1595 */ 1596 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, 1597 bool host_initiated) 1598 { 1599 struct msr_data msr; 1600 1601 if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) 1602 return KVM_MSR_RET_FILTERED; 1603 1604 switch (index) { 1605 case MSR_FS_BASE: 1606 case MSR_GS_BASE: 1607 case MSR_KERNEL_GS_BASE: 1608 case MSR_CSTAR: 1609 case MSR_LSTAR: 1610 if (is_noncanonical_address(data, vcpu)) 1611 return 1; 1612 break; 1613 case MSR_IA32_SYSENTER_EIP: 1614 case MSR_IA32_SYSENTER_ESP: 1615 /* 1616 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if 1617 * non-canonical address is written on Intel but not on 1618 * AMD (which ignores the top 32-bits, because it does 1619 * not implement 64-bit SYSENTER). 1620 * 1621 * 64-bit code should hence be able to write a non-canonical 1622 * value on AMD. Making the address canonical ensures that 1623 * vmentry does not fail on Intel after writing a non-canonical 1624 * value, and that something deterministic happens if the guest 1625 * invokes 64-bit SYSENTER. 1626 */ 1627 data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); 1628 } 1629 1630 msr.data = data; 1631 msr.index = index; 1632 msr.host_initiated = host_initiated; 1633 1634 return static_call(kvm_x86_set_msr)(vcpu, &msr); 1635 } 1636 1637 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1638 u32 index, u64 data, bool host_initiated) 1639 { 1640 int ret = __kvm_set_msr(vcpu, index, data, host_initiated); 1641 1642 if (ret == KVM_MSR_RET_INVALID) 1643 if (kvm_msr_ignored_check(index, data, true)) 1644 ret = 0; 1645 1646 return ret; 1647 } 1648 1649 /* 1650 * Read the MSR specified by @index into @data. Select MSR specific fault 1651 * checks are bypassed if @host_initiated is %true. 1652 * Returns 0 on success, non-0 otherwise. 1653 * Assumes vcpu_load() was already called. 
1654 */ 1655 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1656 bool host_initiated) 1657 { 1658 struct msr_data msr; 1659 int ret; 1660 1661 if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1662 return KVM_MSR_RET_FILTERED; 1663 1664 msr.index = index; 1665 msr.host_initiated = host_initiated; 1666 1667 ret = static_call(kvm_x86_get_msr)(vcpu, &msr); 1668 if (!ret) 1669 *data = msr.data; 1670 return ret; 1671 } 1672 1673 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1674 u32 index, u64 *data, bool host_initiated) 1675 { 1676 int ret = __kvm_get_msr(vcpu, index, data, host_initiated); 1677 1678 if (ret == KVM_MSR_RET_INVALID) { 1679 /* Unconditionally clear *data for simplicity */ 1680 *data = 0; 1681 if (kvm_msr_ignored_check(index, 0, false)) 1682 ret = 0; 1683 } 1684 1685 return ret; 1686 } 1687 1688 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1689 { 1690 return kvm_get_msr_ignored_check(vcpu, index, data, false); 1691 } 1692 EXPORT_SYMBOL_GPL(kvm_get_msr); 1693 1694 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 1695 { 1696 return kvm_set_msr_ignored_check(vcpu, index, data, false); 1697 } 1698 EXPORT_SYMBOL_GPL(kvm_set_msr); 1699 1700 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) 1701 { 1702 int err = vcpu->run->msr.error; 1703 if (!err) { 1704 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); 1705 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); 1706 } 1707 1708 return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); 1709 } 1710 1711 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) 1712 { 1713 return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); 1714 } 1715 1716 static u64 kvm_msr_reason(int r) 1717 { 1718 switch (r) { 1719 case KVM_MSR_RET_INVALID: 1720 return KVM_MSR_EXIT_REASON_UNKNOWN; 1721 case KVM_MSR_RET_FILTERED: 1722 return KVM_MSR_EXIT_REASON_FILTER; 1723 default: 1724 return KVM_MSR_EXIT_REASON_INVAL; 1725 } 1726 } 1727 1728 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, 1729 u32 exit_reason, u64 data, 1730 int (*completion)(struct kvm_vcpu *vcpu), 1731 int r) 1732 { 1733 u64 msr_reason = kvm_msr_reason(r); 1734 1735 /* Check if the user wanted to know about this MSR fault */ 1736 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) 1737 return 0; 1738 1739 vcpu->run->exit_reason = exit_reason; 1740 vcpu->run->msr.error = 0; 1741 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); 1742 vcpu->run->msr.reason = msr_reason; 1743 vcpu->run->msr.index = index; 1744 vcpu->run->msr.data = data; 1745 vcpu->arch.complete_userspace_io = completion; 1746 1747 return 1; 1748 } 1749 1750 static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) 1751 { 1752 return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, 1753 complete_emulated_rdmsr, r); 1754 } 1755 1756 static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) 1757 { 1758 return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, 1759 complete_emulated_wrmsr, r); 1760 } 1761 1762 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) 1763 { 1764 u32 ecx = kvm_rcx_read(vcpu); 1765 u64 data; 1766 int r; 1767 1768 r = kvm_get_msr(vcpu, ecx, &data); 1769 1770 /* MSR read failed? 
See if we should ask user space */ 1771 if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { 1772 /* Bounce to user space */ 1773 return 0; 1774 } 1775 1776 if (!r) { 1777 trace_kvm_msr_read(ecx, data); 1778 1779 kvm_rax_write(vcpu, data & -1u); 1780 kvm_rdx_write(vcpu, (data >> 32) & -1u); 1781 } else { 1782 trace_kvm_msr_read_ex(ecx); 1783 } 1784 1785 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); 1786 } 1787 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); 1788 1789 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) 1790 { 1791 u32 ecx = kvm_rcx_read(vcpu); 1792 u64 data = kvm_read_edx_eax(vcpu); 1793 int r; 1794 1795 r = kvm_set_msr(vcpu, ecx, data); 1796 1797 /* MSR write failed? See if we should ask user space */ 1798 if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) 1799 /* Bounce to user space */ 1800 return 0; 1801 1802 /* Signal all other negative errors to userspace */ 1803 if (r < 0) 1804 return r; 1805 1806 if (!r) 1807 trace_kvm_msr_write(ecx, data); 1808 else 1809 trace_kvm_msr_write_ex(ecx, data); 1810 1811 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); 1812 } 1813 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); 1814 1815 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) 1816 { 1817 xfer_to_guest_mode_prepare(); 1818 return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || 1819 xfer_to_guest_mode_work_pending(); 1820 } 1821 1822 /* 1823 * The fast path for frequent and performance sensitive wrmsr emulation, 1824 * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces 1825 * the latency of virtual IPI by avoiding the expensive bits of transitioning 1826 * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the 1827 * other cases which must be called after interrupts are enabled on the host. 
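 * Only writes to the x2APIC ICR and to MSR_IA32_TSCDEADLINE are handled in
 * this fastpath; any other MSR leaves it with EXIT_FASTPATH_NONE and goes
 * through the normal exit path.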
1828 */ 1829 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) 1830 { 1831 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) 1832 return 1; 1833 1834 if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && 1835 ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && 1836 ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && 1837 ((u32)(data >> 32) != X2APIC_BROADCAST)) { 1838 1839 data &= ~(1 << 12); 1840 kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); 1841 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); 1842 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); 1843 trace_kvm_apic_write(APIC_ICR, (u32)data); 1844 return 0; 1845 } 1846 1847 return 1; 1848 } 1849 1850 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) 1851 { 1852 if (!kvm_can_use_hv_timer(vcpu)) 1853 return 1; 1854 1855 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1856 return 0; 1857 } 1858 1859 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) 1860 { 1861 u32 msr = kvm_rcx_read(vcpu); 1862 u64 data; 1863 fastpath_t ret = EXIT_FASTPATH_NONE; 1864 1865 switch (msr) { 1866 case APIC_BASE_MSR + (APIC_ICR >> 4): 1867 data = kvm_read_edx_eax(vcpu); 1868 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { 1869 kvm_skip_emulated_instruction(vcpu); 1870 ret = EXIT_FASTPATH_EXIT_HANDLED; 1871 } 1872 break; 1873 case MSR_IA32_TSCDEADLINE: 1874 data = kvm_read_edx_eax(vcpu); 1875 if (!handle_fastpath_set_tscdeadline(vcpu, data)) { 1876 kvm_skip_emulated_instruction(vcpu); 1877 ret = EXIT_FASTPATH_REENTER_GUEST; 1878 } 1879 break; 1880 default: 1881 break; 1882 } 1883 1884 if (ret != EXIT_FASTPATH_NONE) 1885 trace_kvm_msr_write(msr, data); 1886 1887 return ret; 1888 } 1889 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); 1890 1891 /* 1892 * Adapt set_msr() to msr_io()'s calling convention 1893 */ 1894 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1895 { 1896 return kvm_get_msr_ignored_check(vcpu, index, data, true); 1897 } 1898 1899 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1900 { 1901 return kvm_set_msr_ignored_check(vcpu, index, *data, true); 1902 } 1903 1904 #ifdef CONFIG_X86_64 1905 struct pvclock_clock { 1906 int vclock_mode; 1907 u64 cycle_last; 1908 u64 mask; 1909 u32 mult; 1910 u32 shift; 1911 u64 base_cycles; 1912 u64 offset; 1913 }; 1914 1915 struct pvclock_gtod_data { 1916 seqcount_t seq; 1917 1918 struct pvclock_clock clock; /* extract of a clocksource struct */ 1919 struct pvclock_clock raw_clock; /* extract of a clocksource struct */ 1920 1921 ktime_t offs_boot; 1922 u64 wall_time_sec; 1923 }; 1924 1925 static struct pvclock_gtod_data pvclock_gtod_data; 1926 1927 static void update_pvclock_gtod(struct timekeeper *tk) 1928 { 1929 struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 1930 1931 write_seqcount_begin(&vdata->seq); 1932 1933 /* copy pvclock gtod data */ 1934 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; 1935 vdata->clock.cycle_last = tk->tkr_mono.cycle_last; 1936 vdata->clock.mask = tk->tkr_mono.mask; 1937 vdata->clock.mult = tk->tkr_mono.mult; 1938 vdata->clock.shift = tk->tkr_mono.shift; 1939 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; 1940 vdata->clock.offset = tk->tkr_mono.base; 1941 1942 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; 1943 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; 1944 vdata->raw_clock.mask = tk->tkr_raw.mask; 1945 vdata->raw_clock.mult = tk->tkr_raw.mult; 1946 
vdata->raw_clock.shift = tk->tkr_raw.shift; 1947 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; 1948 vdata->raw_clock.offset = tk->tkr_raw.base; 1949 1950 vdata->wall_time_sec = tk->xtime_sec; 1951 1952 vdata->offs_boot = tk->offs_boot; 1953 1954 write_seqcount_end(&vdata->seq); 1955 } 1956 1957 static s64 get_kvmclock_base_ns(void) 1958 { 1959 /* Count up from boot time, but with the frequency of the raw clock. */ 1960 return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); 1961 } 1962 #else 1963 static s64 get_kvmclock_base_ns(void) 1964 { 1965 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ 1966 return ktime_get_boottime_ns(); 1967 } 1968 #endif 1969 1970 void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) 1971 { 1972 int version; 1973 int r; 1974 struct pvclock_wall_clock wc; 1975 u32 wc_sec_hi; 1976 u64 wall_nsec; 1977 1978 if (!wall_clock) 1979 return; 1980 1981 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 1982 if (r) 1983 return; 1984 1985 if (version & 1) 1986 ++version; /* first time write, random junk */ 1987 1988 ++version; 1989 1990 if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) 1991 return; 1992 1993 /* 1994 * The guest calculates current wall clock time by adding 1995 * system time (updated by kvm_guest_time_update below) to the 1996 * wall clock specified here. We do the reverse here. 1997 */ 1998 wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); 1999 2000 wc.nsec = do_div(wall_nsec, 1000000000); 2001 wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ 2002 wc.version = version; 2003 2004 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 2005 2006 if (sec_hi_ofs) { 2007 wc_sec_hi = wall_nsec >> 32; 2008 kvm_write_guest(kvm, wall_clock + sec_hi_ofs, 2009 &wc_sec_hi, sizeof(wc_sec_hi)); 2010 } 2011 2012 version++; 2013 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 2014 } 2015 2016 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, 2017 bool old_msr, bool host_initiated) 2018 { 2019 struct kvm_arch *ka = &vcpu->kvm->arch; 2020 2021 if (vcpu->vcpu_id == 0 && !host_initiated) { 2022 if (ka->boot_vcpu_runs_old_kvmclock != old_msr) 2023 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 2024 2025 ka->boot_vcpu_runs_old_kvmclock = old_msr; 2026 } 2027 2028 vcpu->arch.time = system_time; 2029 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 2030 2031 /* we verify if the enable bit is set... 
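(bit 0 of the MSR value; the remaining bits give the guest physical address of the pvclock_vcpu_time_info area, hence the ~1ULL mask below)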
*/ 2032 vcpu->arch.pv_time_enabled = false; 2033 if (!(system_time & 1)) 2034 return; 2035 2036 if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, 2037 &vcpu->arch.pv_time, system_time & ~1ULL, 2038 sizeof(struct pvclock_vcpu_time_info))) 2039 vcpu->arch.pv_time_enabled = true; 2040 2041 return; 2042 } 2043 2044 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 2045 { 2046 do_shl32_div32(dividend, divisor); 2047 return dividend; 2048 } 2049 2050 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, 2051 s8 *pshift, u32 *pmultiplier) 2052 { 2053 uint64_t scaled64; 2054 int32_t shift = 0; 2055 uint64_t tps64; 2056 uint32_t tps32; 2057 2058 tps64 = base_hz; 2059 scaled64 = scaled_hz; 2060 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { 2061 tps64 >>= 1; 2062 shift--; 2063 } 2064 2065 tps32 = (uint32_t)tps64; 2066 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { 2067 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) 2068 scaled64 >>= 1; 2069 else 2070 tps32 <<= 1; 2071 shift++; 2072 } 2073 2074 *pshift = shift; 2075 *pmultiplier = div_frac(scaled64, tps32); 2076 } 2077 2078 #ifdef CONFIG_X86_64 2079 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); 2080 #endif 2081 2082 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 2083 static unsigned long max_tsc_khz; 2084 2085 static u32 adjust_tsc_khz(u32 khz, s32 ppm) 2086 { 2087 u64 v = (u64)khz * (1000000 + ppm); 2088 do_div(v, 1000000); 2089 return v; 2090 } 2091 2092 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) 2093 { 2094 u64 ratio; 2095 2096 /* Guest TSC same frequency as host TSC? */ 2097 if (!scale) { 2098 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; 2099 return 0; 2100 } 2101 2102 /* TSC scaling supported? */ 2103 if (!kvm_has_tsc_control) { 2104 if (user_tsc_khz > tsc_khz) { 2105 vcpu->arch.tsc_catchup = 1; 2106 vcpu->arch.tsc_always_catchup = 1; 2107 return 0; 2108 } else { 2109 pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); 2110 return -1; 2111 } 2112 } 2113 2114 /* TSC scaling required - calculate ratio */ 2115 ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, 2116 user_tsc_khz, tsc_khz); 2117 2118 if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { 2119 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", 2120 user_tsc_khz); 2121 return -1; 2122 } 2123 2124 vcpu->arch.tsc_scaling_ratio = ratio; 2125 return 0; 2126 } 2127 2128 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 2129 { 2130 u32 thresh_lo, thresh_hi; 2131 int use_scaling = 0; 2132 2133 /* tsc_khz can be zero if TSC calibration fails */ 2134 if (user_tsc_khz == 0) { 2135 /* set tsc_scaling_ratio to a safe value */ 2136 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; 2137 return -1; 2138 } 2139 2140 /* Compute a scale to convert nanoseconds in TSC cycles */ 2141 kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, 2142 &vcpu->arch.virtual_tsc_shift, 2143 &vcpu->arch.virtual_tsc_mult); 2144 vcpu->arch.virtual_tsc_khz = user_tsc_khz; 2145 2146 /* 2147 * Compute the variation in TSC rate which is acceptable 2148 * within the range of tolerance and decide if the 2149 * rate being applied is within that bounds of the hardware 2150 * rate. If so, no scaling or compensation need be done. 
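* For example (illustrative numbers only): with tsc_tolerance_ppm = 250 and a 2,000,000 kHz host TSC, thresh_lo/thresh_hi come out to roughly 1,999,500 and 2,000,500 kHz, so only requests outside that +/-500 kHz window fall back to scaling or catchup.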
2151 */ 2152 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); 2153 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); 2154 if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { 2155 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); 2156 use_scaling = 1; 2157 } 2158 return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); 2159 } 2160 2161 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 2162 { 2163 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, 2164 vcpu->arch.virtual_tsc_mult, 2165 vcpu->arch.virtual_tsc_shift); 2166 tsc += vcpu->arch.this_tsc_write; 2167 return tsc; 2168 } 2169 2170 static inline int gtod_is_based_on_tsc(int mode) 2171 { 2172 return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; 2173 } 2174 2175 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 2176 { 2177 #ifdef CONFIG_X86_64 2178 bool vcpus_matched; 2179 struct kvm_arch *ka = &vcpu->kvm->arch; 2180 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2181 2182 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 2183 atomic_read(&vcpu->kvm->online_vcpus)); 2184 2185 /* 2186 * Once the masterclock is enabled, always perform request in 2187 * order to update it. 2188 * 2189 * In order to enable masterclock, the host clocksource must be TSC 2190 * and the vcpus need to have matched TSCs. When that happens, 2191 * perform request to enable masterclock. 2192 */ 2193 if (ka->use_master_clock || 2194 (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) 2195 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 2196 2197 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, 2198 atomic_read(&vcpu->kvm->online_vcpus), 2199 ka->use_master_clock, gtod->clock.vclock_mode); 2200 #endif 2201 } 2202 2203 /* 2204 * Multiply tsc by a fixed point number represented by ratio. 2205 * 2206 * The most significant 64-N bits (mult) of ratio represent the 2207 * integral part of the fixed point number; the remaining N bits 2208 * (frac) represent the fractional part, ie. ratio represents a fixed 2209 * point number (mult + frac * 2^(-N)). 2210 * 2211 * N equals to kvm_tsc_scaling_ratio_frac_bits. 2212 */ 2213 static inline u64 __scale_tsc(u64 ratio, u64 tsc) 2214 { 2215 return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); 2216 } 2217 2218 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) 2219 { 2220 u64 _tsc = tsc; 2221 u64 ratio = vcpu->arch.tsc_scaling_ratio; 2222 2223 if (ratio != kvm_default_tsc_scaling_ratio) 2224 _tsc = __scale_tsc(ratio, tsc); 2225 2226 return _tsc; 2227 } 2228 EXPORT_SYMBOL_GPL(kvm_scale_tsc); 2229 2230 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2231 { 2232 u64 tsc; 2233 2234 tsc = kvm_scale_tsc(vcpu, rdtsc()); 2235 2236 return target_tsc - tsc; 2237 } 2238 2239 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2240 { 2241 return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); 2242 } 2243 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); 2244 2245 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2246 { 2247 vcpu->arch.l1_tsc_offset = offset; 2248 vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); 2249 } 2250 2251 static inline bool kvm_check_tsc_unstable(void) 2252 { 2253 #ifdef CONFIG_X86_64 2254 /* 2255 * TSC is marked unstable when we're running on Hyper-V, 2256 * 'TSC page' clocksource is good. 
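* That is: the raw TSC is flagged as unstable when running on Hyper-V, but readings taken through the Hyper-V 'TSC page' clocksource remain consistent, so in that case report the TSC as stable (return false) below.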
*/ 2258 if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) 2259 return false; 2260 #endif 2261 return check_tsc_unstable(); 2262 } 2263 2264 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) 2265 { 2266 struct kvm *kvm = vcpu->kvm; 2267 u64 offset, ns, elapsed; 2268 unsigned long flags; 2269 bool matched; 2270 bool already_matched; 2271 bool synchronizing = false; 2272 2273 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 2274 offset = kvm_compute_tsc_offset(vcpu, data); 2275 ns = get_kvmclock_base_ns(); 2276 elapsed = ns - kvm->arch.last_tsc_nsec; 2277 2278 if (vcpu->arch.virtual_tsc_khz) { 2279 if (data == 0) { 2280 /* 2281 * detection of vcpu initialization -- need to sync 2282 * with other vCPUs. This particularly helps to keep 2283 * kvm_clock stable after CPU hotplug 2284 */ 2285 synchronizing = true; 2286 } else { 2287 u64 tsc_exp = kvm->arch.last_tsc_write + 2288 nsec_to_cycles(vcpu, elapsed); 2289 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; 2290 /* 2291 * Special case: TSC write with a small delta (1 second) 2292 * of virtual cycle time against real time is 2293 * interpreted as an attempt to synchronize the CPU. 2294 */ 2295 synchronizing = data < tsc_exp + tsc_hz && 2296 data + tsc_hz > tsc_exp; 2297 } 2298 } 2299 2300 /* 2301 * For a reliable TSC, we can match TSC offsets, and for an unstable 2302 * TSC, we add elapsed time in this computation. We could let the 2303 * compensation code attempt to catch up if we fall behind, but 2304 * it's better to try to match offsets from the beginning. 2305 */ 2306 if (synchronizing && 2307 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { 2308 if (!kvm_check_tsc_unstable()) { 2309 offset = kvm->arch.cur_tsc_offset; 2310 } else { 2311 u64 delta = nsec_to_cycles(vcpu, elapsed); 2312 data += delta; 2313 offset = kvm_compute_tsc_offset(vcpu, data); 2314 } 2315 matched = true; 2316 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); 2317 } else { 2318 /* 2319 * We split periods of matched TSC writes into generations. 2320 * For each generation, we track the original measured 2321 * nanosecond time, offset, and write, so if TSCs are in 2322 * sync, we can match exact offset, and if not, we can match 2323 * exact software computation in compute_guest_tsc() 2324 * 2325 * These values are tracked in kvm->arch.cur_xxx variables. 2326 */ 2327 kvm->arch.cur_tsc_generation++; 2328 kvm->arch.cur_tsc_nsec = ns; 2329 kvm->arch.cur_tsc_write = data; 2330 kvm->arch.cur_tsc_offset = offset; 2331 matched = false; 2332 } 2333 2334 /* 2335 * We also track the most recent recorded KHZ, write and time to 2336 * allow the matching interval to be extended at each write.
2337 */ 2338 kvm->arch.last_tsc_nsec = ns; 2339 kvm->arch.last_tsc_write = data; 2340 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 2341 2342 vcpu->arch.last_guest_tsc = data; 2343 2344 /* Keep track of which generation this VCPU has synchronized to */ 2345 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; 2346 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 2347 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 2348 2349 kvm_vcpu_write_tsc_offset(vcpu, offset); 2350 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 2351 2352 spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); 2353 if (!matched) { 2354 kvm->arch.nr_vcpus_matched_tsc = 0; 2355 } else if (!already_matched) { 2356 kvm->arch.nr_vcpus_matched_tsc++; 2357 } 2358 2359 kvm_track_tsc_matching(vcpu); 2360 spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); 2361 } 2362 2363 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, 2364 s64 adjustment) 2365 { 2366 u64 tsc_offset = vcpu->arch.l1_tsc_offset; 2367 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); 2368 } 2369 2370 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) 2371 { 2372 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) 2373 WARN_ON(adjustment < 0); 2374 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); 2375 adjust_tsc_offset_guest(vcpu, adjustment); 2376 } 2377 2378 #ifdef CONFIG_X86_64 2379 2380 static u64 read_tsc(void) 2381 { 2382 u64 ret = (u64)rdtsc_ordered(); 2383 u64 last = pvclock_gtod_data.clock.cycle_last; 2384 2385 if (likely(ret >= last)) 2386 return ret; 2387 2388 /* 2389 * GCC likes to generate cmov here, but this branch is extremely 2390 * predictable (it's just a function of time and the likely is 2391 * very likely) and there's a data dependence, so force GCC 2392 * to generate a branch instead. I don't barrier() because 2393 * we don't actually need a barrier, and if this function 2394 * ever gets inlined it will generate worse code. 
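* (The empty asm volatile("") below is what forces the branch: it acts as a compiler-level barrier that keeps the two return paths from being merged into a cmov, and it emits no instructions of its own.)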
2395 */ 2396 asm volatile (""); 2397 return last; 2398 } 2399 2400 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, 2401 int *mode) 2402 { 2403 long v; 2404 u64 tsc_pg_val; 2405 2406 switch (clock->vclock_mode) { 2407 case VDSO_CLOCKMODE_HVCLOCK: 2408 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), 2409 tsc_timestamp); 2410 if (tsc_pg_val != U64_MAX) { 2411 /* TSC page valid */ 2412 *mode = VDSO_CLOCKMODE_HVCLOCK; 2413 v = (tsc_pg_val - clock->cycle_last) & 2414 clock->mask; 2415 } else { 2416 /* TSC page invalid */ 2417 *mode = VDSO_CLOCKMODE_NONE; 2418 } 2419 break; 2420 case VDSO_CLOCKMODE_TSC: 2421 *mode = VDSO_CLOCKMODE_TSC; 2422 *tsc_timestamp = read_tsc(); 2423 v = (*tsc_timestamp - clock->cycle_last) & 2424 clock->mask; 2425 break; 2426 default: 2427 *mode = VDSO_CLOCKMODE_NONE; 2428 } 2429 2430 if (*mode == VDSO_CLOCKMODE_NONE) 2431 *tsc_timestamp = v = 0; 2432 2433 return v * clock->mult; 2434 } 2435 2436 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) 2437 { 2438 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2439 unsigned long seq; 2440 int mode; 2441 u64 ns; 2442 2443 do { 2444 seq = read_seqcount_begin(&gtod->seq); 2445 ns = gtod->raw_clock.base_cycles; 2446 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode); 2447 ns >>= gtod->raw_clock.shift; 2448 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); 2449 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 2450 *t = ns; 2451 2452 return mode; 2453 } 2454 2455 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) 2456 { 2457 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2458 unsigned long seq; 2459 int mode; 2460 u64 ns; 2461 2462 do { 2463 seq = read_seqcount_begin(&gtod->seq); 2464 ts->tv_sec = gtod->wall_time_sec; 2465 ns = gtod->clock.base_cycles; 2466 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode); 2467 ns >>= gtod->clock.shift; 2468 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 2469 2470 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); 2471 ts->tv_nsec = ns; 2472 2473 return mode; 2474 } 2475 2476 /* returns true if host is using TSC based clocksource */ 2477 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) 2478 { 2479 /* checked again under seqlock below */ 2480 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) 2481 return false; 2482 2483 return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, 2484 tsc_timestamp)); 2485 } 2486 2487 /* returns true if host is using TSC based clocksource */ 2488 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, 2489 u64 *tsc_timestamp) 2490 { 2491 /* checked again under seqlock below */ 2492 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) 2493 return false; 2494 2495 return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); 2496 } 2497 #endif 2498 2499 /* 2500 * 2501 * Assuming a stable TSC across physical CPUs, and a stable TSC 2502 * across virtual CPUs, the following condition is possible. 2503 * Each numbered line represents an event visible to both 2504 * CPUs at the next numbered event. 2505 * 2506 * "timespecX" represents host monotonic time. "tscX" represents 2507 * RDTSC value. 2508 * 2509 * VCPU0 on CPU0 | VCPU1 on CPU1 2510 * 2511 * 1. read timespec0,tsc0 2512 * 2. | timespec1 = timespec0 + N 2513 * | tsc1 = tsc0 + M 2514 * 3. transition to guest | transition to guest 2515 * 4. ret0 = timespec0 + (rdtsc - tsc0) | 2516 * 5.
| ret1 = timespec1 + (rdtsc - tsc1) 2517 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) 2518 * 2519 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: 2520 * 2521 * - ret0 < ret1 2522 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) 2523 * ... 2524 * - 0 < N - M => M < N 2525 * 2526 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not 2527 * always the case (the difference between two distinct xtime instances 2528 * might be smaller than the difference between corresponding TSC reads, 2529 * when updating guest vcpus pvclock areas). 2530 * 2531 * To avoid that problem, do not allow visibility of distinct 2532 * system_timestamp/tsc_timestamp values simultaneously: use a master 2533 * copy of host monotonic time values. Update that master copy 2534 * in lockstep. 2535 * 2536 * Rely on synchronization of host TSCs and guest TSCs for monotonicity. 2537 * 2538 */ 2539 2540 static void pvclock_update_vm_gtod_copy(struct kvm *kvm) 2541 { 2542 #ifdef CONFIG_X86_64 2543 struct kvm_arch *ka = &kvm->arch; 2544 int vclock_mode; 2545 bool host_tsc_clocksource, vcpus_matched; 2546 2547 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 2548 atomic_read(&kvm->online_vcpus)); 2549 2550 /* 2551 * If the host uses TSC clock, then passthrough TSC as stable 2552 * to the guest. 2553 */ 2554 host_tsc_clocksource = kvm_get_time_and_clockread( 2555 &ka->master_kernel_ns, 2556 &ka->master_cycle_now); 2557 2558 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 2559 && !ka->backwards_tsc_observed 2560 && !ka->boot_vcpu_runs_old_kvmclock; 2561 2562 if (ka->use_master_clock) 2563 atomic_set(&kvm_guest_has_master_clock, 1); 2564 2565 vclock_mode = pvclock_gtod_data.clock.vclock_mode; 2566 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, 2567 vcpus_matched); 2568 #endif 2569 } 2570 2571 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 2572 { 2573 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 2574 } 2575 2576 static void kvm_gen_update_masterclock(struct kvm *kvm) 2577 { 2578 #ifdef CONFIG_X86_64 2579 int i; 2580 struct kvm_vcpu *vcpu; 2581 struct kvm_arch *ka = &kvm->arch; 2582 unsigned long flags; 2583 2584 kvm_hv_invalidate_tsc_page(kvm); 2585 2586 kvm_make_mclock_inprogress_request(kvm); 2587 2588 /* no guest entries from this point */ 2589 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2590 pvclock_update_vm_gtod_copy(kvm); 2591 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2592 2593 kvm_for_each_vcpu(i, vcpu, kvm) 2594 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2595 2596 /* guest entries allowed */ 2597 kvm_for_each_vcpu(i, vcpu, kvm) 2598 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); 2599 #endif 2600 } 2601 2602 u64 get_kvmclock_ns(struct kvm *kvm) 2603 { 2604 struct kvm_arch *ka = &kvm->arch; 2605 struct pvclock_vcpu_time_info hv_clock; 2606 unsigned long flags; 2607 u64 ret; 2608 2609 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2610 if (!ka->use_master_clock) { 2611 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2612 return get_kvmclock_base_ns() + ka->kvmclock_offset; 2613 } 2614 2615 hv_clock.tsc_timestamp = ka->master_cycle_now; 2616 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; 2617 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2618 2619 /* both __this_cpu_read() and rdtsc() should be on the same cpu */ 2620 get_cpu(); 2621 2622 if (__this_cpu_read(cpu_tsc_khz)) { 2623 kvm_get_time_scale(NSEC_PER_SEC,
__this_cpu_read(cpu_tsc_khz) * 1000LL, 2624 &hv_clock.tsc_shift, 2625 &hv_clock.tsc_to_system_mul); 2626 ret = __pvclock_read_cycles(&hv_clock, rdtsc()); 2627 } else 2628 ret = get_kvmclock_base_ns() + ka->kvmclock_offset; 2629 2630 put_cpu(); 2631 2632 return ret; 2633 } 2634 2635 static void kvm_setup_pvclock_page(struct kvm_vcpu *v, 2636 struct gfn_to_hva_cache *cache, 2637 unsigned int offset) 2638 { 2639 struct kvm_vcpu_arch *vcpu = &v->arch; 2640 struct pvclock_vcpu_time_info guest_hv_clock; 2641 2642 if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, 2643 &guest_hv_clock, offset, sizeof(guest_hv_clock)))) 2644 return; 2645 2646 /* This VCPU is paused, but it's legal for a guest to read another 2647 * VCPU's kvmclock, so we really have to follow the specification where 2648 * it says that version is odd if data is being modified, and even after 2649 * it is consistent. 2650 * 2651 * Version field updates must be kept separate. This is because 2652 * kvm_write_guest_cached might use a "rep movs" instruction, and 2653 * writes within a string instruction are weakly ordered. So there 2654 * are three writes overall. 2655 * 2656 * As a small optimization, only write the version field in the first 2657 * and third write. The vcpu->pv_time cache is still valid, because the 2658 * version field is the first in the struct. 2659 */ 2660 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); 2661 2662 if (guest_hv_clock.version & 1) 2663 ++guest_hv_clock.version; /* first time write, random junk */ 2664 2665 vcpu->hv_clock.version = guest_hv_clock.version + 1; 2666 kvm_write_guest_offset_cached(v->kvm, cache, 2667 &vcpu->hv_clock, offset, 2668 sizeof(vcpu->hv_clock.version)); 2669 2670 smp_wmb(); 2671 2672 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 2673 vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); 2674 2675 if (vcpu->pvclock_set_guest_stopped_request) { 2676 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; 2677 vcpu->pvclock_set_guest_stopped_request = false; 2678 } 2679 2680 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); 2681 2682 kvm_write_guest_offset_cached(v->kvm, cache, 2683 &vcpu->hv_clock, offset, 2684 sizeof(vcpu->hv_clock)); 2685 2686 smp_wmb(); 2687 2688 vcpu->hv_clock.version++; 2689 kvm_write_guest_offset_cached(v->kvm, cache, 2690 &vcpu->hv_clock, offset, 2691 sizeof(vcpu->hv_clock.version)); 2692 } 2693 2694 static int kvm_guest_time_update(struct kvm_vcpu *v) 2695 { 2696 unsigned long flags, tgt_tsc_khz; 2697 struct kvm_vcpu_arch *vcpu = &v->arch; 2698 struct kvm_arch *ka = &v->kvm->arch; 2699 s64 kernel_ns; 2700 u64 tsc_timestamp, host_tsc; 2701 u8 pvclock_flags; 2702 bool use_master_clock; 2703 2704 kernel_ns = 0; 2705 host_tsc = 0; 2706 2707 /* 2708 * If the host uses TSC clock, then passthrough TSC as stable 2709 * to the guest. 
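* When the masterclock is in use, reuse the (master_cycle_now, master_kernel_ns) snapshot taken in pvclock_update_vm_gtod_copy() so every vCPU derives its pvclock from the same host reading; otherwise fall back to a fresh rdtsc() and get_kvmclock_base_ns() below.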
2710 */ 2711 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2712 use_master_clock = ka->use_master_clock; 2713 if (use_master_clock) { 2714 host_tsc = ka->master_cycle_now; 2715 kernel_ns = ka->master_kernel_ns; 2716 } 2717 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2718 2719 /* Keep irq disabled to prevent changes to the clock */ 2720 local_irq_save(flags); 2721 tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); 2722 if (unlikely(tgt_tsc_khz == 0)) { 2723 local_irq_restore(flags); 2724 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 2725 return 1; 2726 } 2727 if (!use_master_clock) { 2728 host_tsc = rdtsc(); 2729 kernel_ns = get_kvmclock_base_ns(); 2730 } 2731 2732 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); 2733 2734 /* 2735 * We may have to catch up the TSC to match elapsed wall clock 2736 * time for two reasons, even if kvmclock is used. 2737 * 1) CPU could have been running below the maximum TSC rate 2738 * 2) Broken TSC compensation resets the base at each VCPU 2739 * entry to avoid unknown leaps of TSC even when running 2740 * again on the same CPU. This may cause apparent elapsed 2741 * time to disappear, and the guest to stand still or run 2742 * very slowly. 2743 */ 2744 if (vcpu->tsc_catchup) { 2745 u64 tsc = compute_guest_tsc(v, kernel_ns); 2746 if (tsc > tsc_timestamp) { 2747 adjust_tsc_offset_guest(v, tsc - tsc_timestamp); 2748 tsc_timestamp = tsc; 2749 } 2750 } 2751 2752 local_irq_restore(flags); 2753 2754 /* With all the info we got, fill in the values */ 2755 2756 if (kvm_has_tsc_control) 2757 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); 2758 2759 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { 2760 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, 2761 &vcpu->hv_clock.tsc_shift, 2762 &vcpu->hv_clock.tsc_to_system_mul); 2763 vcpu->hw_tsc_khz = tgt_tsc_khz; 2764 } 2765 2766 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 2767 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 2768 vcpu->last_guest_tsc = tsc_timestamp; 2769 2770 /* If the host uses TSC clocksource, then it is stable */ 2771 pvclock_flags = 0; 2772 if (use_master_clock) 2773 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 2774 2775 vcpu->hv_clock.flags = pvclock_flags; 2776 2777 if (vcpu->pv_time_enabled) 2778 kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); 2779 if (vcpu->xen.vcpu_info_set) 2780 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, 2781 offsetof(struct compat_vcpu_info, time)); 2782 if (vcpu->xen.vcpu_time_info_set) 2783 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); 2784 if (v == kvm_get_vcpu(v->kvm, 0)) 2785 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); 2786 return 0; 2787 } 2788 2789 /* 2790 * kvmclock updates which are isolated to a given vcpu, such as 2791 * vcpu->cpu migration, should not allow system_timestamp from 2792 * the rest of the vcpus to remain static. Otherwise ntp frequency 2793 * correction applies to one vcpu's system_timestamp but not 2794 * the others. 2795 * 2796 * So in those cases, request a kvmclock update for all vcpus. 2797 * We need to rate-limit these requests though, as they can 2798 * considerably slow guests that have a large number of vcpus. 2799 * The time for a remote vcpu to update its kvmclock is bound 2800 * by the delay we use to rate-limit the updates. 
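* With KVMCLOCK_UPDATE_DELAY below set to 100ms, a vcpu-local update therefore reaches the remaining vcpus within roughly a tenth of a second.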
2801 */ 2802 2803 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) 2804 2805 static void kvmclock_update_fn(struct work_struct *work) 2806 { 2807 int i; 2808 struct delayed_work *dwork = to_delayed_work(work); 2809 struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 2810 kvmclock_update_work); 2811 struct kvm *kvm = container_of(ka, struct kvm, arch); 2812 struct kvm_vcpu *vcpu; 2813 2814 kvm_for_each_vcpu(i, vcpu, kvm) { 2815 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2816 kvm_vcpu_kick(vcpu); 2817 } 2818 } 2819 2820 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 2821 { 2822 struct kvm *kvm = v->kvm; 2823 2824 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 2825 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 2826 KVMCLOCK_UPDATE_DELAY); 2827 } 2828 2829 #define KVMCLOCK_SYNC_PERIOD (300 * HZ) 2830 2831 static void kvmclock_sync_fn(struct work_struct *work) 2832 { 2833 struct delayed_work *dwork = to_delayed_work(work); 2834 struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 2835 kvmclock_sync_work); 2836 struct kvm *kvm = container_of(ka, struct kvm, arch); 2837 2838 if (!kvmclock_periodic_sync) 2839 return; 2840 2841 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 2842 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 2843 KVMCLOCK_SYNC_PERIOD); 2844 } 2845 2846 /* 2847 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. 2848 */ 2849 static bool can_set_mci_status(struct kvm_vcpu *vcpu) 2850 { 2851 /* McStatusWrEn enabled? */ 2852 if (guest_cpuid_is_amd_or_hygon(vcpu)) 2853 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); 2854 2855 return false; 2856 } 2857 2858 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2859 { 2860 u64 mcg_cap = vcpu->arch.mcg_cap; 2861 unsigned bank_num = mcg_cap & 0xff; 2862 u32 msr = msr_info->index; 2863 u64 data = msr_info->data; 2864 2865 switch (msr) { 2866 case MSR_IA32_MCG_STATUS: 2867 vcpu->arch.mcg_status = data; 2868 break; 2869 case MSR_IA32_MCG_CTL: 2870 if (!(mcg_cap & MCG_CTL_P) && 2871 (data || !msr_info->host_initiated)) 2872 return 1; 2873 if (data != 0 && data != ~(u64)0) 2874 return 1; 2875 vcpu->arch.mcg_ctl = data; 2876 break; 2877 default: 2878 if (msr >= MSR_IA32_MC0_CTL && 2879 msr < MSR_IA32_MCx_CTL(bank_num)) { 2880 u32 offset = array_index_nospec( 2881 msr - MSR_IA32_MC0_CTL, 2882 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); 2883 2884 /* only 0 or all 1s can be written to IA32_MCi_CTL 2885 * some Linux kernels though clear bit 10 in bank 4 to 2886 * workaround a BIOS/GART TBL issue on AMD K8s, ignore 2887 * this to avoid an uncatched #GP in the guest 2888 */ 2889 if ((offset & 0x3) == 0 && 2890 data != 0 && (data | (1 << 10)) != ~(u64)0) 2891 return -1; 2892 2893 /* MCi_STATUS */ 2894 if (!msr_info->host_initiated && 2895 (offset & 0x3) == 1 && data != 0) { 2896 if (!can_set_mci_status(vcpu)) 2897 return -1; 2898 } 2899 2900 vcpu->arch.mce_banks[offset] = data; 2901 break; 2902 } 2903 return 1; 2904 } 2905 return 0; 2906 } 2907 2908 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 2909 { 2910 u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 2911 2912 return (vcpu->arch.apf.msr_en_val & mask) == mask; 2913 } 2914 2915 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) 2916 { 2917 gpa_t gpa = data & ~0x3f; 2918 2919 /* Bits 4:5 are reserved, Should be zero */ 2920 if (data & 0x30) 2921 return 1; 2922 2923 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && 2924 (data & 
KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) 2925 return 1; 2926 2927 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && 2928 (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) 2929 return 1; 2930 2931 if (!lapic_in_kernel(vcpu)) 2932 return data ? 1 : 0; 2933 2934 vcpu->arch.apf.msr_en_val = data; 2935 2936 if (!kvm_pv_async_pf_enabled(vcpu)) { 2937 kvm_clear_async_pf_completion_queue(vcpu); 2938 kvm_async_pf_hash_reset(vcpu); 2939 return 0; 2940 } 2941 2942 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, 2943 sizeof(u64))) 2944 return 1; 2945 2946 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); 2947 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; 2948 2949 kvm_async_pf_wakeup_all(vcpu); 2950 2951 return 0; 2952 } 2953 2954 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) 2955 { 2956 /* Bits 8-63 are reserved */ 2957 if (data >> 8) 2958 return 1; 2959 2960 if (!lapic_in_kernel(vcpu)) 2961 return 1; 2962 2963 vcpu->arch.apf.msr_int_val = data; 2964 2965 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; 2966 2967 return 0; 2968 } 2969 2970 static void kvmclock_reset(struct kvm_vcpu *vcpu) 2971 { 2972 vcpu->arch.pv_time_enabled = false; 2973 vcpu->arch.time = 0; 2974 } 2975 2976 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) 2977 { 2978 ++vcpu->stat.tlb_flush; 2979 static_call(kvm_x86_tlb_flush_all)(vcpu); 2980 } 2981 2982 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) 2983 { 2984 ++vcpu->stat.tlb_flush; 2985 2986 if (!tdp_enabled) { 2987 /* 2988 * A TLB flush on behalf of the guest is equivalent to 2989 * INVPCID(all), toggling CR4.PGE, etc., which requires 2990 * a forced sync of the shadow page tables. Unload the 2991 * entire MMU here and the subsequent load will sync the 2992 * shadow page tables, and also flush the TLB. 2993 */ 2994 kvm_mmu_unload(vcpu); 2995 return; 2996 } 2997 2998 static_call(kvm_x86_tlb_flush_guest)(vcpu); 2999 } 3000 3001 static void record_steal_time(struct kvm_vcpu *vcpu) 3002 { 3003 struct kvm_host_map map; 3004 struct kvm_steal_time *st; 3005 3006 if (kvm_xen_msr_enabled(vcpu->kvm)) { 3007 kvm_xen_runstate_set_running(vcpu); 3008 return; 3009 } 3010 3011 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 3012 return; 3013 3014 /* -EAGAIN is returned in atomic context so we can just return. */ 3015 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, 3016 &map, &vcpu->arch.st.cache, false)) 3017 return; 3018 3019 st = map.hva + 3020 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); 3021 3022 /* 3023 * Doing a TLB flush here, on the guest's behalf, can avoid 3024 * expensive IPIs. 
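* The guest requests the flush by setting KVM_VCPU_FLUSH_TLB in st->preempted; the xchg() below clears the field atomically so the request is consumed exactly once.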
3025 */ 3026 if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { 3027 trace_kvm_pv_tlb_flush(vcpu->vcpu_id, 3028 st->preempted & KVM_VCPU_FLUSH_TLB); 3029 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) 3030 kvm_vcpu_flush_tlb_guest(vcpu); 3031 } else { 3032 st->preempted = 0; 3033 } 3034 3035 vcpu->arch.st.preempted = 0; 3036 3037 if (st->version & 1) 3038 st->version += 1; /* first time write, random junk */ 3039 3040 st->version += 1; 3041 3042 smp_wmb(); 3043 3044 st->steal += current->sched_info.run_delay - 3045 vcpu->arch.st.last_steal; 3046 vcpu->arch.st.last_steal = current->sched_info.run_delay; 3047 3048 smp_wmb(); 3049 3050 st->version += 1; 3051 3052 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); 3053 } 3054 3055 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3056 { 3057 bool pr = false; 3058 u32 msr = msr_info->index; 3059 u64 data = msr_info->data; 3060 3061 if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) 3062 return kvm_xen_write_hypercall_page(vcpu, data); 3063 3064 switch (msr) { 3065 case MSR_AMD64_NB_CFG: 3066 case MSR_IA32_UCODE_WRITE: 3067 case MSR_VM_HSAVE_PA: 3068 case MSR_AMD64_PATCH_LOADER: 3069 case MSR_AMD64_BU_CFG2: 3070 case MSR_AMD64_DC_CFG: 3071 case MSR_F15H_EX_CFG: 3072 break; 3073 3074 case MSR_IA32_UCODE_REV: 3075 if (msr_info->host_initiated) 3076 vcpu->arch.microcode_version = data; 3077 break; 3078 case MSR_IA32_ARCH_CAPABILITIES: 3079 if (!msr_info->host_initiated) 3080 return 1; 3081 vcpu->arch.arch_capabilities = data; 3082 break; 3083 case MSR_IA32_PERF_CAPABILITIES: { 3084 struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; 3085 3086 if (!msr_info->host_initiated) 3087 return 1; 3088 if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) 3089 return 1; 3090 if (data & ~msr_ent.data) 3091 return 1; 3092 3093 vcpu->arch.perf_capabilities = data; 3094 3095 return 0; 3096 } 3097 case MSR_EFER: 3098 return set_efer(vcpu, msr_info); 3099 case MSR_K7_HWCR: 3100 data &= ~(u64)0x40; /* ignore flush filter disable */ 3101 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 3102 data &= ~(u64)0x8; /* ignore TLB cache disable */ 3103 3104 /* Handle McStatusWrEn */ 3105 if (data == BIT_ULL(18)) { 3106 vcpu->arch.msr_hwcr = data; 3107 } else if (data != 0) { 3108 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 3109 data); 3110 return 1; 3111 } 3112 break; 3113 case MSR_FAM10H_MMIO_CONF_BASE: 3114 if (data != 0) { 3115 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 3116 "0x%llx\n", data); 3117 return 1; 3118 } 3119 break; 3120 case 0x200 ... 0x2ff: 3121 return kvm_mtrr_set_msr(vcpu, msr, data); 3122 case MSR_IA32_APICBASE: 3123 return kvm_set_apic_base(vcpu, msr_info); 3124 case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: 3125 return kvm_x2apic_msr_write(vcpu, msr, data); 3126 case MSR_IA32_TSCDEADLINE: 3127 kvm_set_lapic_tscdeadline_msr(vcpu, data); 3128 break; 3129 case MSR_IA32_TSC_ADJUST: 3130 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { 3131 if (!msr_info->host_initiated) { 3132 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 3133 adjust_tsc_offset_guest(vcpu, adj); 3134 } 3135 vcpu->arch.ia32_tsc_adjust_msr = data; 3136 } 3137 break; 3138 case MSR_IA32_MISC_ENABLE: 3139 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && 3140 ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { 3141 if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) 3142 return 1; 3143 vcpu->arch.ia32_misc_enable_msr = data; 3144 kvm_update_cpuid_runtime(vcpu); 3145 } else { 3146 vcpu->arch.ia32_misc_enable_msr = data; 3147 } 3148 break; 3149 case MSR_IA32_SMBASE: 3150 if (!msr_info->host_initiated) 3151 return 1; 3152 vcpu->arch.smbase = data; 3153 break; 3154 case MSR_IA32_POWER_CTL: 3155 vcpu->arch.msr_ia32_power_ctl = data; 3156 break; 3157 case MSR_IA32_TSC: 3158 if (msr_info->host_initiated) { 3159 kvm_synchronize_tsc(vcpu, data); 3160 } else { 3161 u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; 3162 adjust_tsc_offset_guest(vcpu, adj); 3163 vcpu->arch.ia32_tsc_adjust_msr += adj; 3164 } 3165 break; 3166 case MSR_IA32_XSS: 3167 if (!msr_info->host_initiated && 3168 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3169 return 1; 3170 /* 3171 * KVM supports exposing PT to the guest, but does not support 3172 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than 3173 * XSAVES/XRSTORS to save/restore PT MSRs. 3174 */ 3175 if (data & ~supported_xss) 3176 return 1; 3177 vcpu->arch.ia32_xss = data; 3178 break; 3179 case MSR_SMI_COUNT: 3180 if (!msr_info->host_initiated) 3181 return 1; 3182 vcpu->arch.smi_count = data; 3183 break; 3184 case MSR_KVM_WALL_CLOCK_NEW: 3185 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3186 return 1; 3187 3188 vcpu->kvm->arch.wall_clock = data; 3189 kvm_write_wall_clock(vcpu->kvm, data, 0); 3190 break; 3191 case MSR_KVM_WALL_CLOCK: 3192 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3193 return 1; 3194 3195 vcpu->kvm->arch.wall_clock = data; 3196 kvm_write_wall_clock(vcpu->kvm, data, 0); 3197 break; 3198 case MSR_KVM_SYSTEM_TIME_NEW: 3199 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3200 return 1; 3201 3202 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); 3203 break; 3204 case MSR_KVM_SYSTEM_TIME: 3205 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3206 return 1; 3207 3208 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); 3209 break; 3210 case MSR_KVM_ASYNC_PF_EN: 3211 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3212 return 1; 3213 3214 if (kvm_pv_enable_async_pf(vcpu, data)) 3215 return 1; 3216 break; 3217 case MSR_KVM_ASYNC_PF_INT: 3218 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 3219 return 1; 3220 3221 if (kvm_pv_enable_async_pf_int(vcpu, data)) 3222 return 1; 3223 break; 3224 case MSR_KVM_ASYNC_PF_ACK: 3225 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3226 return 1; 3227 if (data & 0x1) { 3228 vcpu->arch.apf.pageready_pending = false; 3229 kvm_check_async_pf_completion(vcpu); 3230 } 3231 break; 3232 case MSR_KVM_STEAL_TIME: 3233 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 3234 return 1; 3235 3236 if (unlikely(!sched_info_on())) 3237 return 1; 3238 3239 if (data & KVM_STEAL_RESERVED_MASK) 3240 return 1; 3241 3242 vcpu->arch.st.msr_val = 
data; 3243 3244 if (!(data & KVM_MSR_ENABLED)) 3245 break; 3246 3247 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 3248 3249 break; 3250 case MSR_KVM_PV_EOI_EN: 3251 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 3252 return 1; 3253 3254 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) 3255 return 1; 3256 break; 3257 3258 case MSR_KVM_POLL_CONTROL: 3259 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 3260 return 1; 3261 3262 /* only enable bit supported */ 3263 if (data & (-1ULL << 1)) 3264 return 1; 3265 3266 vcpu->arch.msr_kvm_poll_control = data; 3267 break; 3268 3269 case MSR_IA32_MCG_CTL: 3270 case MSR_IA32_MCG_STATUS: 3271 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 3272 return set_msr_mce(vcpu, msr_info); 3273 3274 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 3275 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 3276 pr = true; 3277 fallthrough; 3278 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: 3279 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: 3280 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3281 return kvm_pmu_set_msr(vcpu, msr_info); 3282 3283 if (pr || data != 0) 3284 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 3285 "0x%x data 0x%llx\n", msr, data); 3286 break; 3287 case MSR_K7_CLK_CTL: 3288 /* 3289 * Ignore all writes to this no longer documented MSR. 3290 * Writes are only relevant for old K7 processors, 3291 * all pre-dating SVM, but a recommended workaround from 3292 * AMD for these chips. It is possible to specify the 3293 * affected processor models on the command line, hence 3294 * the need to ignore the workaround. 3295 */ 3296 break; 3297 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 3298 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: 3299 case HV_X64_MSR_SYNDBG_OPTIONS: 3300 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 3301 case HV_X64_MSR_CRASH_CTL: 3302 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 3303 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 3304 case HV_X64_MSR_TSC_EMULATION_CONTROL: 3305 case HV_X64_MSR_TSC_EMULATION_STATUS: 3306 return kvm_hv_set_msr_common(vcpu, msr, data, 3307 msr_info->host_initiated); 3308 case MSR_IA32_BBL_CR_CTL3: 3309 /* Drop writes to this legacy MSR -- see rdmsr 3310 * counterpart for further detail. 
3311 */ 3312 if (report_ignored_msrs) 3313 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", 3314 msr, data); 3315 break; 3316 case MSR_AMD64_OSVW_ID_LENGTH: 3317 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3318 return 1; 3319 vcpu->arch.osvw.length = data; 3320 break; 3321 case MSR_AMD64_OSVW_STATUS: 3322 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3323 return 1; 3324 vcpu->arch.osvw.status = data; 3325 break; 3326 case MSR_PLATFORM_INFO: 3327 if (!msr_info->host_initiated || 3328 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && 3329 cpuid_fault_enabled(vcpu))) 3330 return 1; 3331 vcpu->arch.msr_platform_info = data; 3332 break; 3333 case MSR_MISC_FEATURES_ENABLES: 3334 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || 3335 (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && 3336 !supports_cpuid_fault(vcpu))) 3337 return 1; 3338 vcpu->arch.msr_misc_features_enables = data; 3339 break; 3340 default: 3341 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3342 return kvm_pmu_set_msr(vcpu, msr_info); 3343 return KVM_MSR_RET_INVALID; 3344 } 3345 return 0; 3346 } 3347 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 3348 3349 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) 3350 { 3351 u64 data; 3352 u64 mcg_cap = vcpu->arch.mcg_cap; 3353 unsigned bank_num = mcg_cap & 0xff; 3354 3355 switch (msr) { 3356 case MSR_IA32_P5_MC_ADDR: 3357 case MSR_IA32_P5_MC_TYPE: 3358 data = 0; 3359 break; 3360 case MSR_IA32_MCG_CAP: 3361 data = vcpu->arch.mcg_cap; 3362 break; 3363 case MSR_IA32_MCG_CTL: 3364 if (!(mcg_cap & MCG_CTL_P) && !host) 3365 return 1; 3366 data = vcpu->arch.mcg_ctl; 3367 break; 3368 case MSR_IA32_MCG_STATUS: 3369 data = vcpu->arch.mcg_status; 3370 break; 3371 default: 3372 if (msr >= MSR_IA32_MC0_CTL && 3373 msr < MSR_IA32_MCx_CTL(bank_num)) { 3374 u32 offset = array_index_nospec( 3375 msr - MSR_IA32_MC0_CTL, 3376 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); 3377 3378 data = vcpu->arch.mce_banks[offset]; 3379 break; 3380 } 3381 return 1; 3382 } 3383 *pdata = data; 3384 return 0; 3385 } 3386 3387 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3388 { 3389 switch (msr_info->index) { 3390 case MSR_IA32_PLATFORM_ID: 3391 case MSR_IA32_EBL_CR_POWERON: 3392 case MSR_IA32_LASTBRANCHFROMIP: 3393 case MSR_IA32_LASTBRANCHTOIP: 3394 case MSR_IA32_LASTINTFROMIP: 3395 case MSR_IA32_LASTINTTOIP: 3396 case MSR_K8_SYSCFG: 3397 case MSR_K8_TSEG_ADDR: 3398 case MSR_K8_TSEG_MASK: 3399 case MSR_VM_HSAVE_PA: 3400 case MSR_K8_INT_PENDING_MSG: 3401 case MSR_AMD64_NB_CFG: 3402 case MSR_FAM10H_MMIO_CONF_BASE: 3403 case MSR_AMD64_BU_CFG2: 3404 case MSR_IA32_PERF_CTL: 3405 case MSR_AMD64_DC_CFG: 3406 case MSR_F15H_EX_CFG: 3407 /* 3408 * Intel Sandy Bridge CPUs must support the RAPL (running average power 3409 * limit) MSRs. Just return 0, as we do not want to expose the host 3410 * data here. Do not conditionalize this on CPUID, as KVM does not do 3411 * so for existing CPU-specific MSRs. 3412 */ 3413 case MSR_RAPL_POWER_UNIT: 3414 case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ 3415 case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ 3416 case MSR_PKG_ENERGY_STATUS: /* Total package */ 3417 case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ 3418 msr_info->data = 0; 3419 break; 3420 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: 3421 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: 3422 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 3423 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 3424 case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: 3425 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3426 return kvm_pmu_get_msr(vcpu, msr_info); 3427 msr_info->data = 0; 3428 break; 3429 case MSR_IA32_UCODE_REV: 3430 msr_info->data = vcpu->arch.microcode_version; 3431 break; 3432 case MSR_IA32_ARCH_CAPABILITIES: 3433 if (!msr_info->host_initiated && 3434 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) 3435 return 1; 3436 msr_info->data = vcpu->arch.arch_capabilities; 3437 break; 3438 case MSR_IA32_PERF_CAPABILITIES: 3439 if (!msr_info->host_initiated && 3440 !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) 3441 return 1; 3442 msr_info->data = vcpu->arch.perf_capabilities; 3443 break; 3444 case MSR_IA32_POWER_CTL: 3445 msr_info->data = vcpu->arch.msr_ia32_power_ctl; 3446 break; 3447 case MSR_IA32_TSC: { 3448 /* 3449 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset 3450 * even when not intercepted. AMD manual doesn't explicitly 3451 * state this but appears to behave the same. 3452 * 3453 * On userspace reads and writes, however, we unconditionally 3454 * return L1's TSC value to ensure backwards-compatible 3455 * behavior for migration. 3456 */ 3457 u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : 3458 vcpu->arch.tsc_offset; 3459 3460 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; 3461 break; 3462 } 3463 case MSR_MTRRcap: 3464 case 0x200 ... 0x2ff: 3465 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); 3466 case 0xcd: /* fsb frequency */ 3467 msr_info->data = 3; 3468 break; 3469 /* 3470 * MSR_EBC_FREQUENCY_ID 3471 * Conservative value valid for even the basic CPU models. 3472 * Models 0,1: 000 in bits 23:21 indicating a bus speed of 3473 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, 3474 * and 266MHz for model 3, or 4. Set Core Clock 3475 * Frequency to System Bus Frequency Ratio to 1 (bits 3476 * 31:24) even though these are only valid for CPU 3477 * models > 2, however guests may end up dividing or 3478 * multiplying by zero otherwise. 3479 */ 3480 case MSR_EBC_FREQUENCY_ID: 3481 msr_info->data = 1 << 24; 3482 break; 3483 case MSR_IA32_APICBASE: 3484 msr_info->data = kvm_get_apic_base(vcpu); 3485 break; 3486 case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: 3487 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); 3488 case MSR_IA32_TSCDEADLINE: 3489 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); 3490 break; 3491 case MSR_IA32_TSC_ADJUST: 3492 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 3493 break; 3494 case MSR_IA32_MISC_ENABLE: 3495 msr_info->data = vcpu->arch.ia32_misc_enable_msr; 3496 break; 3497 case MSR_IA32_SMBASE: 3498 if (!msr_info->host_initiated) 3499 return 1; 3500 msr_info->data = vcpu->arch.smbase; 3501 break; 3502 case MSR_SMI_COUNT: 3503 msr_info->data = vcpu->arch.smi_count; 3504 break; 3505 case MSR_IA32_PERF_STATUS: 3506 /* TSC increment by tick */ 3507 msr_info->data = 1000ULL; 3508 /* CPU multiplier */ 3509 msr_info->data |= (((uint64_t)4ULL) << 40); 3510 break; 3511 case MSR_EFER: 3512 msr_info->data = vcpu->arch.efer; 3513 break; 3514 case MSR_KVM_WALL_CLOCK: 3515 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3516 return 1; 3517 3518 msr_info->data = vcpu->kvm->arch.wall_clock; 3519 break; 3520 case MSR_KVM_WALL_CLOCK_NEW: 3521 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3522 return 1; 3523 3524 msr_info->data = vcpu->kvm->arch.wall_clock; 3525 break; 3526 case MSR_KVM_SYSTEM_TIME: 3527 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3528 return 1; 3529 3530 msr_info->data = vcpu->arch.time; 3531 break; 3532 case MSR_KVM_SYSTEM_TIME_NEW: 3533 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3534 return 1; 3535 3536 msr_info->data = vcpu->arch.time; 3537 break; 3538 case MSR_KVM_ASYNC_PF_EN: 3539 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3540 return 1; 3541 3542 msr_info->data = vcpu->arch.apf.msr_en_val; 3543 break; 3544 case MSR_KVM_ASYNC_PF_INT: 3545 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 3546 return 1; 3547 3548 msr_info->data = vcpu->arch.apf.msr_int_val; 3549 break; 3550 case MSR_KVM_ASYNC_PF_ACK: 3551 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3552 return 1; 3553 3554 msr_info->data = 0; 3555 break; 3556 case MSR_KVM_STEAL_TIME: 3557 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 3558 return 1; 3559 3560 msr_info->data = vcpu->arch.st.msr_val; 3561 break; 3562 case MSR_KVM_PV_EOI_EN: 3563 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 3564 return 1; 3565 3566 msr_info->data = vcpu->arch.pv_eoi.msr_val; 3567 break; 3568 case MSR_KVM_POLL_CONTROL: 3569 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 3570 return 1; 3571 3572 msr_info->data = vcpu->arch.msr_kvm_poll_control; 3573 break; 3574 case MSR_IA32_P5_MC_ADDR: 3575 case MSR_IA32_P5_MC_TYPE: 3576 case MSR_IA32_MCG_CAP: 3577 case MSR_IA32_MCG_CTL: 3578 case MSR_IA32_MCG_STATUS: 3579 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 3580 return get_msr_mce(vcpu, msr_info->index, &msr_info->data, 3581 msr_info->host_initiated); 3582 case MSR_IA32_XSS: 3583 if (!msr_info->host_initiated && 3584 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3585 return 1; 3586 msr_info->data = vcpu->arch.ia32_xss; 3587 break; 3588 case MSR_K7_CLK_CTL: 3589 /* 3590 * Provide expected ramp-up count for K7. All other 3591 * are set to zero, indicating minimum divisors for 3592 * every field. 3593 * 3594 * This prevents guest kernels on AMD host with CPU 3595 * type 6, model 8 and higher from exploding due to 3596 * the rdmsr failing. 3597 */ 3598 msr_info->data = 0x20000000; 3599 break; 3600 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 3601 case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: 3602 case HV_X64_MSR_SYNDBG_OPTIONS: 3603 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 3604 case HV_X64_MSR_CRASH_CTL: 3605 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 3606 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 3607 case HV_X64_MSR_TSC_EMULATION_CONTROL: 3608 case HV_X64_MSR_TSC_EMULATION_STATUS: 3609 return kvm_hv_get_msr_common(vcpu, 3610 msr_info->index, &msr_info->data, 3611 msr_info->host_initiated); 3612 case MSR_IA32_BBL_CR_CTL3: 3613 /* This legacy MSR exists but isn't fully documented in current 3614 * silicon. It is however accessed by winxp in very narrow 3615 * scenarios where it sets bit #19, itself documented as 3616 * a "reserved" bit. Best effort attempt to source coherent 3617 * read data here should the balance of the register be 3618 * interpreted by the guest: 3619 * 3620 * L2 cache control register 3: 64GB range, 256KB size, 3621 * enabled, latency 0x1, configured 3622 */ 3623 msr_info->data = 0xbe702111; 3624 break; 3625 case MSR_AMD64_OSVW_ID_LENGTH: 3626 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3627 return 1; 3628 msr_info->data = vcpu->arch.osvw.length; 3629 break; 3630 case MSR_AMD64_OSVW_STATUS: 3631 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3632 return 1; 3633 msr_info->data = vcpu->arch.osvw.status; 3634 break; 3635 case MSR_PLATFORM_INFO: 3636 if (!msr_info->host_initiated && 3637 !vcpu->kvm->arch.guest_can_read_msr_platform_info) 3638 return 1; 3639 msr_info->data = vcpu->arch.msr_platform_info; 3640 break; 3641 case MSR_MISC_FEATURES_ENABLES: 3642 msr_info->data = vcpu->arch.msr_misc_features_enables; 3643 break; 3644 case MSR_K7_HWCR: 3645 msr_info->data = vcpu->arch.msr_hwcr; 3646 break; 3647 default: 3648 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3649 return kvm_pmu_get_msr(vcpu, msr_info); 3650 return KVM_MSR_RET_INVALID; 3651 } 3652 return 0; 3653 } 3654 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 3655 3656 /* 3657 * Read or write a bunch of msrs. All parameters are kernel addresses. 3658 * 3659 * @return number of msrs set successfully. 3660 */ 3661 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 3662 struct kvm_msr_entry *entries, 3663 int (*do_msr)(struct kvm_vcpu *vcpu, 3664 unsigned index, u64 *data)) 3665 { 3666 int i; 3667 3668 for (i = 0; i < msrs->nmsrs; ++i) 3669 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 3670 break; 3671 3672 return i; 3673 } 3674 3675 /* 3676 * Read or write a bunch of msrs. Parameters are user addresses. 3677 * 3678 * @return number of msrs set successfully. 
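* (A negative errno is returned instead when copying the arguments to or from userspace fails or nmsrs is too large; a short count means do_msr() rejected an entry and processing stopped there.)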
3679 */ 3680 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 3681 int (*do_msr)(struct kvm_vcpu *vcpu, 3682 unsigned index, u64 *data), 3683 int writeback) 3684 { 3685 struct kvm_msrs msrs; 3686 struct kvm_msr_entry *entries; 3687 int r, n; 3688 unsigned size; 3689 3690 r = -EFAULT; 3691 if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) 3692 goto out; 3693 3694 r = -E2BIG; 3695 if (msrs.nmsrs >= MAX_IO_MSRS) 3696 goto out; 3697 3698 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 3699 entries = memdup_user(user_msrs->entries, size); 3700 if (IS_ERR(entries)) { 3701 r = PTR_ERR(entries); 3702 goto out; 3703 } 3704 3705 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 3706 if (r < 0) 3707 goto out_free; 3708 3709 r = -EFAULT; 3710 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 3711 goto out_free; 3712 3713 r = n; 3714 3715 out_free: 3716 kfree(entries); 3717 out: 3718 return r; 3719 } 3720 3721 static inline bool kvm_can_mwait_in_guest(void) 3722 { 3723 return boot_cpu_has(X86_FEATURE_MWAIT) && 3724 !boot_cpu_has_bug(X86_BUG_MONITOR) && 3725 boot_cpu_has(X86_FEATURE_ARAT); 3726 } 3727 3728 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, 3729 struct kvm_cpuid2 __user *cpuid_arg) 3730 { 3731 struct kvm_cpuid2 cpuid; 3732 int r; 3733 3734 r = -EFAULT; 3735 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 3736 return r; 3737 3738 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3739 if (r) 3740 return r; 3741 3742 r = -EFAULT; 3743 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 3744 return r; 3745 3746 return 0; 3747 } 3748 3749 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) 3750 { 3751 int r = 0; 3752 3753 switch (ext) { 3754 case KVM_CAP_IRQCHIP: 3755 case KVM_CAP_HLT: 3756 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 3757 case KVM_CAP_SET_TSS_ADDR: 3758 case KVM_CAP_EXT_CPUID: 3759 case KVM_CAP_EXT_EMUL_CPUID: 3760 case KVM_CAP_CLOCKSOURCE: 3761 case KVM_CAP_PIT: 3762 case KVM_CAP_NOP_IO_DELAY: 3763 case KVM_CAP_MP_STATE: 3764 case KVM_CAP_SYNC_MMU: 3765 case KVM_CAP_USER_NMI: 3766 case KVM_CAP_REINJECT_CONTROL: 3767 case KVM_CAP_IRQ_INJECT_STATUS: 3768 case KVM_CAP_IOEVENTFD: 3769 case KVM_CAP_IOEVENTFD_NO_LENGTH: 3770 case KVM_CAP_PIT2: 3771 case KVM_CAP_PIT_STATE2: 3772 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 3773 case KVM_CAP_VCPU_EVENTS: 3774 case KVM_CAP_HYPERV: 3775 case KVM_CAP_HYPERV_VAPIC: 3776 case KVM_CAP_HYPERV_SPIN: 3777 case KVM_CAP_HYPERV_SYNIC: 3778 case KVM_CAP_HYPERV_SYNIC2: 3779 case KVM_CAP_HYPERV_VP_INDEX: 3780 case KVM_CAP_HYPERV_EVENTFD: 3781 case KVM_CAP_HYPERV_TLBFLUSH: 3782 case KVM_CAP_HYPERV_SEND_IPI: 3783 case KVM_CAP_HYPERV_CPUID: 3784 case KVM_CAP_SYS_HYPERV_CPUID: 3785 case KVM_CAP_PCI_SEGMENT: 3786 case KVM_CAP_DEBUGREGS: 3787 case KVM_CAP_X86_ROBUST_SINGLESTEP: 3788 case KVM_CAP_XSAVE: 3789 case KVM_CAP_ASYNC_PF: 3790 case KVM_CAP_ASYNC_PF_INT: 3791 case KVM_CAP_GET_TSC_KHZ: 3792 case KVM_CAP_KVMCLOCK_CTRL: 3793 case KVM_CAP_READONLY_MEM: 3794 case KVM_CAP_HYPERV_TIME: 3795 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 3796 case KVM_CAP_TSC_DEADLINE_TIMER: 3797 case KVM_CAP_DISABLE_QUIRKS: 3798 case KVM_CAP_SET_BOOT_CPU_ID: 3799 case KVM_CAP_SPLIT_IRQCHIP: 3800 case KVM_CAP_IMMEDIATE_EXIT: 3801 case KVM_CAP_PMU_EVENT_FILTER: 3802 case KVM_CAP_GET_MSR_FEATURES: 3803 case KVM_CAP_MSR_PLATFORM_INFO: 3804 case KVM_CAP_EXCEPTION_PAYLOAD: 3805 case KVM_CAP_SET_GUEST_DEBUG: 3806 case KVM_CAP_LAST_CPU: 3807 case KVM_CAP_X86_USER_SPACE_MSR: 3808 case KVM_CAP_X86_MSR_FILTER: 3809 case 
KVM_CAP_ENFORCE_PV_FEATURE_CPUID: 3810 r = 1; 3811 break; 3812 #ifdef CONFIG_KVM_XEN 3813 case KVM_CAP_XEN_HVM: 3814 r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 3815 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | 3816 KVM_XEN_HVM_CONFIG_SHARED_INFO; 3817 if (sched_info_on()) 3818 r |= KVM_XEN_HVM_CONFIG_RUNSTATE; 3819 break; 3820 #endif 3821 case KVM_CAP_SYNC_REGS: 3822 r = KVM_SYNC_X86_VALID_FIELDS; 3823 break; 3824 case KVM_CAP_ADJUST_CLOCK: 3825 r = KVM_CLOCK_TSC_STABLE; 3826 break; 3827 case KVM_CAP_X86_DISABLE_EXITS: 3828 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | 3829 KVM_X86_DISABLE_EXITS_CSTATE; 3830 if(kvm_can_mwait_in_guest()) 3831 r |= KVM_X86_DISABLE_EXITS_MWAIT; 3832 break; 3833 case KVM_CAP_X86_SMM: 3834 /* SMBASE is usually relocated above 1M on modern chipsets, 3835 * and SMM handlers might indeed rely on 4G segment limits, 3836 * so do not report SMM to be available if real mode is 3837 * emulated via vm86 mode. Still, do not go to great lengths 3838 * to avoid userspace's usage of the feature, because it is a 3839 * fringe case that is not enabled except via specific settings 3840 * of the module parameters. 3841 */ 3842 r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); 3843 break; 3844 case KVM_CAP_VAPIC: 3845 r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); 3846 break; 3847 case KVM_CAP_NR_VCPUS: 3848 r = KVM_SOFT_MAX_VCPUS; 3849 break; 3850 case KVM_CAP_MAX_VCPUS: 3851 r = KVM_MAX_VCPUS; 3852 break; 3853 case KVM_CAP_MAX_VCPU_ID: 3854 r = KVM_MAX_VCPU_ID; 3855 break; 3856 case KVM_CAP_PV_MMU: /* obsolete */ 3857 r = 0; 3858 break; 3859 case KVM_CAP_MCE: 3860 r = KVM_MAX_MCE_BANKS; 3861 break; 3862 case KVM_CAP_XCRS: 3863 r = boot_cpu_has(X86_FEATURE_XSAVE); 3864 break; 3865 case KVM_CAP_TSC_CONTROL: 3866 r = kvm_has_tsc_control; 3867 break; 3868 case KVM_CAP_X2APIC_API: 3869 r = KVM_X2APIC_API_VALID_FLAGS; 3870 break; 3871 case KVM_CAP_NESTED_STATE: 3872 r = kvm_x86_ops.nested_ops->get_state ? 
3873 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; 3874 break; 3875 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: 3876 r = kvm_x86_ops.enable_direct_tlbflush != NULL; 3877 break; 3878 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 3879 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; 3880 break; 3881 case KVM_CAP_SMALLER_MAXPHYADDR: 3882 r = (int) allow_smaller_maxphyaddr; 3883 break; 3884 case KVM_CAP_STEAL_TIME: 3885 r = sched_info_on(); 3886 break; 3887 case KVM_CAP_X86_BUS_LOCK_EXIT: 3888 if (kvm_has_bus_lock_exit) 3889 r = KVM_BUS_LOCK_DETECTION_OFF | 3890 KVM_BUS_LOCK_DETECTION_EXIT; 3891 else 3892 r = 0; 3893 break; 3894 default: 3895 break; 3896 } 3897 return r; 3898 3899 } 3900 3901 long kvm_arch_dev_ioctl(struct file *filp, 3902 unsigned int ioctl, unsigned long arg) 3903 { 3904 void __user *argp = (void __user *)arg; 3905 long r; 3906 3907 switch (ioctl) { 3908 case KVM_GET_MSR_INDEX_LIST: { 3909 struct kvm_msr_list __user *user_msr_list = argp; 3910 struct kvm_msr_list msr_list; 3911 unsigned n; 3912 3913 r = -EFAULT; 3914 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) 3915 goto out; 3916 n = msr_list.nmsrs; 3917 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; 3918 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) 3919 goto out; 3920 r = -E2BIG; 3921 if (n < msr_list.nmsrs) 3922 goto out; 3923 r = -EFAULT; 3924 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 3925 num_msrs_to_save * sizeof(u32))) 3926 goto out; 3927 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 3928 &emulated_msrs, 3929 num_emulated_msrs * sizeof(u32))) 3930 goto out; 3931 r = 0; 3932 break; 3933 } 3934 case KVM_GET_SUPPORTED_CPUID: 3935 case KVM_GET_EMULATED_CPUID: { 3936 struct kvm_cpuid2 __user *cpuid_arg = argp; 3937 struct kvm_cpuid2 cpuid; 3938 3939 r = -EFAULT; 3940 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 3941 goto out; 3942 3943 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, 3944 ioctl); 3945 if (r) 3946 goto out; 3947 3948 r = -EFAULT; 3949 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 3950 goto out; 3951 r = 0; 3952 break; 3953 } 3954 case KVM_X86_GET_MCE_CAP_SUPPORTED: 3955 r = -EFAULT; 3956 if (copy_to_user(argp, &kvm_mce_cap_supported, 3957 sizeof(kvm_mce_cap_supported))) 3958 goto out; 3959 r = 0; 3960 break; 3961 case KVM_GET_MSR_FEATURE_INDEX_LIST: { 3962 struct kvm_msr_list __user *user_msr_list = argp; 3963 struct kvm_msr_list msr_list; 3964 unsigned int n; 3965 3966 r = -EFAULT; 3967 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) 3968 goto out; 3969 n = msr_list.nmsrs; 3970 msr_list.nmsrs = num_msr_based_features; 3971 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) 3972 goto out; 3973 r = -E2BIG; 3974 if (n < msr_list.nmsrs) 3975 goto out; 3976 r = -EFAULT; 3977 if (copy_to_user(user_msr_list->indices, &msr_based_features, 3978 num_msr_based_features * sizeof(u32))) 3979 goto out; 3980 r = 0; 3981 break; 3982 } 3983 case KVM_GET_MSRS: 3984 r = msr_io(NULL, argp, do_get_msr_feature, 1); 3985 break; 3986 case KVM_GET_SUPPORTED_HV_CPUID: 3987 r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); 3988 break; 3989 default: 3990 r = -EINVAL; 3991 break; 3992 } 3993 out: 3994 return r; 3995 } 3996 3997 static void wbinvd_ipi(void *garbage) 3998 { 3999 wbinvd(); 4000 } 4001 4002 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 4003 { 4004 return kvm_arch_has_noncoherent_dma(vcpu->kvm); 4005 } 4006 4007 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 4008 { 4009 /* Address WBINVD may 
be executed by guest */ 4010 if (need_emulate_wbinvd(vcpu)) { 4011 if (static_call(kvm_x86_has_wbinvd_exit)()) 4012 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 4013 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 4014 smp_call_function_single(vcpu->cpu, 4015 wbinvd_ipi, NULL, 1); 4016 } 4017 4018 static_call(kvm_x86_vcpu_load)(vcpu, cpu); 4019 4020 /* Save host pkru register if supported */ 4021 vcpu->arch.host_pkru = read_pkru(); 4022 4023 /* Apply any externally detected TSC adjustments (due to suspend) */ 4024 if (unlikely(vcpu->arch.tsc_offset_adjustment)) { 4025 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); 4026 vcpu->arch.tsc_offset_adjustment = 0; 4027 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 4028 } 4029 4030 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { 4031 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 4032 rdtsc() - vcpu->arch.last_host_tsc; 4033 if (tsc_delta < 0) 4034 mark_tsc_unstable("KVM discovered backwards TSC"); 4035 4036 if (kvm_check_tsc_unstable()) { 4037 u64 offset = kvm_compute_tsc_offset(vcpu, 4038 vcpu->arch.last_guest_tsc); 4039 kvm_vcpu_write_tsc_offset(vcpu, offset); 4040 vcpu->arch.tsc_catchup = 1; 4041 } 4042 4043 if (kvm_lapic_hv_timer_in_use(vcpu)) 4044 kvm_lapic_restart_hv_timer(vcpu); 4045 4046 /* 4047 * On a host with synchronized TSC, there is no need to update 4048 * kvmclock on vcpu->cpu migration 4049 */ 4050 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 4051 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 4052 if (vcpu->cpu != cpu) 4053 kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); 4054 vcpu->cpu = cpu; 4055 } 4056 4057 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 4058 } 4059 4060 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) 4061 { 4062 struct kvm_host_map map; 4063 struct kvm_steal_time *st; 4064 4065 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 4066 return; 4067 4068 if (vcpu->arch.st.preempted) 4069 return; 4070 4071 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, 4072 &vcpu->arch.st.cache, true)) 4073 return; 4074 4075 st = map.hva + 4076 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); 4077 4078 st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; 4079 4080 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); 4081 } 4082 4083 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 4084 { 4085 int idx; 4086 4087 if (vcpu->preempted && !vcpu->arch.guest_state_protected) 4088 vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); 4089 4090 /* 4091 * Take the srcu lock as memslots will be accessed to check the gfn 4092 * cache generation against the memslots generation. 4093 */ 4094 idx = srcu_read_lock(&vcpu->kvm->srcu); 4095 if (kvm_xen_msr_enabled(vcpu->kvm)) 4096 kvm_xen_runstate_set_preempted(vcpu); 4097 else 4098 kvm_steal_time_set_preempted(vcpu); 4099 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4100 4101 static_call(kvm_x86_vcpu_put)(vcpu); 4102 vcpu->arch.last_host_tsc = rdtsc(); 4103 /* 4104 * If userspace has set any breakpoints or watchpoints, dr6 is restored 4105 * on every vmexit, but if not, we might have a stale dr6 from the 4106 * guest. do_debug expects dr6 to be cleared after it runs, do the same. 
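	 * Note that only the hardware register is cleared here; the guest's
	 * DR6 value is tracked in vcpu->arch.dr6 and reported to userspace
	 * via KVM_GET_DEBUGREGS, so it is not affected by this write.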
4107 */ 4108 set_debugreg(0, 6); 4109 } 4110 4111 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 4112 struct kvm_lapic_state *s) 4113 { 4114 if (vcpu->arch.apicv_active) 4115 static_call(kvm_x86_sync_pir_to_irr)(vcpu); 4116 4117 return kvm_apic_get_state(vcpu, s); 4118 } 4119 4120 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 4121 struct kvm_lapic_state *s) 4122 { 4123 int r; 4124 4125 r = kvm_apic_set_state(vcpu, s); 4126 if (r) 4127 return r; 4128 update_cr8_intercept(vcpu); 4129 4130 return 0; 4131 } 4132 4133 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) 4134 { 4135 /* 4136 * We can accept userspace's request for interrupt injection 4137 * as long as we have a place to store the interrupt number. 4138 * The actual injection will happen when the CPU is able to 4139 * deliver the interrupt. 4140 */ 4141 if (kvm_cpu_has_extint(vcpu)) 4142 return false; 4143 4144 /* Acknowledging ExtINT does not happen if LINT0 is masked. */ 4145 return (!lapic_in_kernel(vcpu) || 4146 kvm_apic_accept_pic_intr(vcpu)); 4147 } 4148 4149 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) 4150 { 4151 return kvm_arch_interrupt_allowed(vcpu) && 4152 kvm_cpu_accept_dm_intr(vcpu); 4153 } 4154 4155 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 4156 struct kvm_interrupt *irq) 4157 { 4158 if (irq->irq >= KVM_NR_INTERRUPTS) 4159 return -EINVAL; 4160 4161 if (!irqchip_in_kernel(vcpu->kvm)) { 4162 kvm_queue_interrupt(vcpu, irq->irq, false); 4163 kvm_make_request(KVM_REQ_EVENT, vcpu); 4164 return 0; 4165 } 4166 4167 /* 4168 * With in-kernel LAPIC, we only use this to inject EXTINT, so 4169 * fail for in-kernel 8259. 4170 */ 4171 if (pic_in_kernel(vcpu->kvm)) 4172 return -ENXIO; 4173 4174 if (vcpu->arch.pending_external_vector != -1) 4175 return -EEXIST; 4176 4177 vcpu->arch.pending_external_vector = irq->irq; 4178 kvm_make_request(KVM_REQ_EVENT, vcpu); 4179 return 0; 4180 } 4181 4182 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 4183 { 4184 kvm_inject_nmi(vcpu); 4185 4186 return 0; 4187 } 4188 4189 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) 4190 { 4191 kvm_make_request(KVM_REQ_SMI, vcpu); 4192 4193 return 0; 4194 } 4195 4196 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 4197 struct kvm_tpr_access_ctl *tac) 4198 { 4199 if (tac->flags) 4200 return -EINVAL; 4201 vcpu->arch.tpr_access_reporting = !!tac->enabled; 4202 return 0; 4203 } 4204 4205 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 4206 u64 mcg_cap) 4207 { 4208 int r; 4209 unsigned bank_num = mcg_cap & 0xff, bank; 4210 4211 r = -EINVAL; 4212 if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) 4213 goto out; 4214 if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) 4215 goto out; 4216 r = 0; 4217 vcpu->arch.mcg_cap = mcg_cap; 4218 /* Init IA32_MCG_CTL to all 1s */ 4219 if (mcg_cap & MCG_CTL_P) 4220 vcpu->arch.mcg_ctl = ~(u64)0; 4221 /* Init IA32_MCi_CTL to all 1s */ 4222 for (bank = 0; bank < bank_num; bank++) 4223 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 4224 4225 static_call(kvm_x86_setup_mce)(vcpu); 4226 out: 4227 return r; 4228 } 4229 4230 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 4231 struct kvm_x86_mce *mce) 4232 { 4233 u64 mcg_cap = vcpu->arch.mcg_cap; 4234 unsigned bank_num = mcg_cap & 0xff; 4235 u64 *banks = vcpu->arch.mce_banks; 4236 4237 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 4238 return -EINVAL; 4239 /* 4240 * if IA32_MCG_CTL is not all 1s, the uncorrected error 4241 * reporting is disabled 4242 
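	 * (the same applies to the per-bank IA32_MCi_CTL check below).  In
	 * that case the MCE is silently dropped: 0 is returned and nothing
	 * is queued, since the guest has masked error reporting.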
*/ 4243 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 4244 vcpu->arch.mcg_ctl != ~(u64)0) 4245 return 0; 4246 banks += 4 * mce->bank; 4247 /* 4248 * if IA32_MCi_CTL is not all 1s, the uncorrected error 4249 * reporting is disabled for the bank 4250 */ 4251 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 4252 return 0; 4253 if (mce->status & MCI_STATUS_UC) { 4254 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 4255 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 4256 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4257 return 0; 4258 } 4259 if (banks[1] & MCI_STATUS_VAL) 4260 mce->status |= MCI_STATUS_OVER; 4261 banks[2] = mce->addr; 4262 banks[3] = mce->misc; 4263 vcpu->arch.mcg_status = mce->mcg_status; 4264 banks[1] = mce->status; 4265 kvm_queue_exception(vcpu, MC_VECTOR); 4266 } else if (!(banks[1] & MCI_STATUS_VAL) 4267 || !(banks[1] & MCI_STATUS_UC)) { 4268 if (banks[1] & MCI_STATUS_VAL) 4269 mce->status |= MCI_STATUS_OVER; 4270 banks[2] = mce->addr; 4271 banks[3] = mce->misc; 4272 banks[1] = mce->status; 4273 } else 4274 banks[1] |= MCI_STATUS_OVER; 4275 return 0; 4276 } 4277 4278 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 4279 struct kvm_vcpu_events *events) 4280 { 4281 process_nmi(vcpu); 4282 4283 if (kvm_check_request(KVM_REQ_SMI, vcpu)) 4284 process_smi(vcpu); 4285 4286 /* 4287 * In guest mode, payload delivery should be deferred, 4288 * so that the L1 hypervisor can intercept #PF before 4289 * CR2 is modified (or intercept #DB before DR6 is 4290 * modified under nVMX). Unless the per-VM capability, 4291 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of 4292 * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we 4293 * opportunistically defer the exception payload, deliver it if the 4294 * capability hasn't been requested before processing a 4295 * KVM_GET_VCPU_EVENTS. 4296 */ 4297 if (!vcpu->kvm->arch.exception_payload_enabled && 4298 vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) 4299 kvm_deliver_exception_payload(vcpu); 4300 4301 /* 4302 * The API doesn't provide the instruction length for software 4303 * exceptions, so don't report them. As long as the guest RIP 4304 * isn't advanced, we should expect to encounter the exception 4305 * again. 4306 */ 4307 if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { 4308 events->exception.injected = 0; 4309 events->exception.pending = 0; 4310 } else { 4311 events->exception.injected = vcpu->arch.exception.injected; 4312 events->exception.pending = vcpu->arch.exception.pending; 4313 /* 4314 * For ABI compatibility, deliberately conflate 4315 * pending and injected exceptions when 4316 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. 
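	 * When the capability is enabled, userspace instead sees the
	 * separate pending/injected bits together with
	 * exception_has_payload/exception_payload, and
	 * KVM_VCPUEVENT_VALID_PAYLOAD is advertised in events->flags below.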
4317 */ 4318 if (!vcpu->kvm->arch.exception_payload_enabled) 4319 events->exception.injected |= 4320 vcpu->arch.exception.pending; 4321 } 4322 events->exception.nr = vcpu->arch.exception.nr; 4323 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 4324 events->exception.error_code = vcpu->arch.exception.error_code; 4325 events->exception_has_payload = vcpu->arch.exception.has_payload; 4326 events->exception_payload = vcpu->arch.exception.payload; 4327 4328 events->interrupt.injected = 4329 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; 4330 events->interrupt.nr = vcpu->arch.interrupt.nr; 4331 events->interrupt.soft = 0; 4332 events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); 4333 4334 events->nmi.injected = vcpu->arch.nmi_injected; 4335 events->nmi.pending = vcpu->arch.nmi_pending != 0; 4336 events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); 4337 events->nmi.pad = 0; 4338 4339 events->sipi_vector = 0; /* never valid when reporting to user space */ 4340 4341 events->smi.smm = is_smm(vcpu); 4342 events->smi.pending = vcpu->arch.smi_pending; 4343 events->smi.smm_inside_nmi = 4344 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); 4345 events->smi.latched_init = kvm_lapic_latched_init(vcpu); 4346 4347 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 4348 | KVM_VCPUEVENT_VALID_SHADOW 4349 | KVM_VCPUEVENT_VALID_SMM); 4350 if (vcpu->kvm->arch.exception_payload_enabled) 4351 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; 4352 4353 memset(&events->reserved, 0, sizeof(events->reserved)); 4354 } 4355 4356 static void kvm_smm_changed(struct kvm_vcpu *vcpu); 4357 4358 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 4359 struct kvm_vcpu_events *events) 4360 { 4361 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 4362 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 4363 | KVM_VCPUEVENT_VALID_SHADOW 4364 | KVM_VCPUEVENT_VALID_SMM 4365 | KVM_VCPUEVENT_VALID_PAYLOAD)) 4366 return -EINVAL; 4367 4368 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { 4369 if (!vcpu->kvm->arch.exception_payload_enabled) 4370 return -EINVAL; 4371 if (events->exception.pending) 4372 events->exception.injected = 0; 4373 else 4374 events->exception_has_payload = 0; 4375 } else { 4376 events->exception.pending = 0; 4377 events->exception_has_payload = 0; 4378 } 4379 4380 if ((events->exception.injected || events->exception.pending) && 4381 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) 4382 return -EINVAL; 4383 4384 /* INITs are latched while in SMM */ 4385 if (events->flags & KVM_VCPUEVENT_VALID_SMM && 4386 (events->smi.smm || events->smi.pending) && 4387 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4388 return -EINVAL; 4389 4390 process_nmi(vcpu); 4391 vcpu->arch.exception.injected = events->exception.injected; 4392 vcpu->arch.exception.pending = events->exception.pending; 4393 vcpu->arch.exception.nr = events->exception.nr; 4394 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 4395 vcpu->arch.exception.error_code = events->exception.error_code; 4396 vcpu->arch.exception.has_payload = events->exception_has_payload; 4397 vcpu->arch.exception.payload = events->exception_payload; 4398 4399 vcpu->arch.interrupt.injected = events->interrupt.injected; 4400 vcpu->arch.interrupt.nr = events->interrupt.nr; 4401 vcpu->arch.interrupt.soft = events->interrupt.soft; 4402 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 4403 static_call(kvm_x86_set_interrupt_shadow)(vcpu, 4404 events->interrupt.shadow); 4405 4406 
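	/*
	 * NMI state: the injected flag is always taken from userspace,
	 * while the pending count is only overwritten when userspace set
	 * KVM_VCPUEVENT_VALID_NMI_PENDING.
	 */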
vcpu->arch.nmi_injected = events->nmi.injected; 4407 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 4408 vcpu->arch.nmi_pending = events->nmi.pending; 4409 static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); 4410 4411 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && 4412 lapic_in_kernel(vcpu)) 4413 vcpu->arch.apic->sipi_vector = events->sipi_vector; 4414 4415 if (events->flags & KVM_VCPUEVENT_VALID_SMM) { 4416 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { 4417 if (events->smi.smm) 4418 vcpu->arch.hflags |= HF_SMM_MASK; 4419 else 4420 vcpu->arch.hflags &= ~HF_SMM_MASK; 4421 kvm_smm_changed(vcpu); 4422 } 4423 4424 vcpu->arch.smi_pending = events->smi.pending; 4425 4426 if (events->smi.smm) { 4427 if (events->smi.smm_inside_nmi) 4428 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; 4429 else 4430 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; 4431 } 4432 4433 if (lapic_in_kernel(vcpu)) { 4434 if (events->smi.latched_init) 4435 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); 4436 else 4437 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); 4438 } 4439 } 4440 4441 kvm_make_request(KVM_REQ_EVENT, vcpu); 4442 4443 return 0; 4444 } 4445 4446 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 4447 struct kvm_debugregs *dbgregs) 4448 { 4449 unsigned long val; 4450 4451 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 4452 kvm_get_dr(vcpu, 6, &val); 4453 dbgregs->dr6 = val; 4454 dbgregs->dr7 = vcpu->arch.dr7; 4455 dbgregs->flags = 0; 4456 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 4457 } 4458 4459 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 4460 struct kvm_debugregs *dbgregs) 4461 { 4462 if (dbgregs->flags) 4463 return -EINVAL; 4464 4465 if (!kvm_dr6_valid(dbgregs->dr6)) 4466 return -EINVAL; 4467 if (!kvm_dr7_valid(dbgregs->dr7)) 4468 return -EINVAL; 4469 4470 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 4471 kvm_update_dr0123(vcpu); 4472 vcpu->arch.dr6 = dbgregs->dr6; 4473 vcpu->arch.dr7 = dbgregs->dr7; 4474 kvm_update_dr7(vcpu); 4475 4476 return 0; 4477 } 4478 4479 #define XSTATE_COMPACTION_ENABLED (1ULL << 63) 4480 4481 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) 4482 { 4483 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; 4484 u64 xstate_bv = xsave->header.xfeatures; 4485 u64 valid; 4486 4487 /* 4488 * Copy legacy XSAVE area, to avoid complications with CPUID 4489 * leaves 0 and 1 in the loop below. 4490 */ 4491 memcpy(dest, xsave, XSAVE_HDR_OFFSET); 4492 4493 /* Set XSTATE_BV */ 4494 xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; 4495 *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; 4496 4497 /* 4498 * Copy each region from the possibly compacted offset to the 4499 * non-compacted offset. 
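	 * Each pass isolates the lowest set feature bit (valid & -valid),
	 * converts it to a feature number with fls64(), takes the
	 * (possibly compacted) source address from get_xsave_addr() and the
	 * standard-format size and offset from CPUID.0xD[xfeature_nr].
	 * PKRU is special-cased because KVM keeps the guest value in
	 * vcpu->arch.pkru rather than in the guest_fpu xsave buffer.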
4500 */ 4501 valid = xstate_bv & ~XFEATURE_MASK_FPSSE; 4502 while (valid) { 4503 u64 xfeature_mask = valid & -valid; 4504 int xfeature_nr = fls64(xfeature_mask) - 1; 4505 void *src = get_xsave_addr(xsave, xfeature_nr); 4506 4507 if (src) { 4508 u32 size, offset, ecx, edx; 4509 cpuid_count(XSTATE_CPUID, xfeature_nr, 4510 &size, &offset, &ecx, &edx); 4511 if (xfeature_nr == XFEATURE_PKRU) 4512 memcpy(dest + offset, &vcpu->arch.pkru, 4513 sizeof(vcpu->arch.pkru)); 4514 else 4515 memcpy(dest + offset, src, size); 4516 4517 } 4518 4519 valid -= xfeature_mask; 4520 } 4521 } 4522 4523 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) 4524 { 4525 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; 4526 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); 4527 u64 valid; 4528 4529 /* 4530 * Copy legacy XSAVE area, to avoid complications with CPUID 4531 * leaves 0 and 1 in the loop below. 4532 */ 4533 memcpy(xsave, src, XSAVE_HDR_OFFSET); 4534 4535 /* Set XSTATE_BV and possibly XCOMP_BV. */ 4536 xsave->header.xfeatures = xstate_bv; 4537 if (boot_cpu_has(X86_FEATURE_XSAVES)) 4538 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; 4539 4540 /* 4541 * Copy each region from the non-compacted offset to the 4542 * possibly compacted offset. 4543 */ 4544 valid = xstate_bv & ~XFEATURE_MASK_FPSSE; 4545 while (valid) { 4546 u64 xfeature_mask = valid & -valid; 4547 int xfeature_nr = fls64(xfeature_mask) - 1; 4548 void *dest = get_xsave_addr(xsave, xfeature_nr); 4549 4550 if (dest) { 4551 u32 size, offset, ecx, edx; 4552 cpuid_count(XSTATE_CPUID, xfeature_nr, 4553 &size, &offset, &ecx, &edx); 4554 if (xfeature_nr == XFEATURE_PKRU) 4555 memcpy(&vcpu->arch.pkru, src + offset, 4556 sizeof(vcpu->arch.pkru)); 4557 else 4558 memcpy(dest, src + offset, size); 4559 } 4560 4561 valid -= xfeature_mask; 4562 } 4563 } 4564 4565 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 4566 struct kvm_xsave *guest_xsave) 4567 { 4568 if (!vcpu->arch.guest_fpu) 4569 return; 4570 4571 if (boot_cpu_has(X86_FEATURE_XSAVE)) { 4572 memset(guest_xsave, 0, sizeof(struct kvm_xsave)); 4573 fill_xsave((u8 *) guest_xsave->region, vcpu); 4574 } else { 4575 memcpy(guest_xsave->region, 4576 &vcpu->arch.guest_fpu->state.fxsave, 4577 sizeof(struct fxregs_state)); 4578 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 4579 XFEATURE_MASK_FPSSE; 4580 } 4581 } 4582 4583 #define XSAVE_MXCSR_OFFSET 24 4584 4585 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 4586 struct kvm_xsave *guest_xsave) 4587 { 4588 u64 xstate_bv; 4589 u32 mxcsr; 4590 4591 if (!vcpu->arch.guest_fpu) 4592 return 0; 4593 4594 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 4595 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; 4596 4597 if (boot_cpu_has(X86_FEATURE_XSAVE)) { 4598 /* 4599 * Here we allow setting states that are not present in 4600 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility 4601 * with old userspace. 
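	 * The value is still sanity-checked: xstate_bv must be a subset of
	 * supported_xcr0 and the saved MXCSR must not set bits outside
	 * mxcsr_feature_mask, otherwise -EINVAL is returned.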
4602 */ 4603 if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) 4604 return -EINVAL; 4605 load_xsave(vcpu, (u8 *)guest_xsave->region); 4606 } else { 4607 if (xstate_bv & ~XFEATURE_MASK_FPSSE || 4608 mxcsr & ~mxcsr_feature_mask) 4609 return -EINVAL; 4610 memcpy(&vcpu->arch.guest_fpu->state.fxsave, 4611 guest_xsave->region, sizeof(struct fxregs_state)); 4612 } 4613 return 0; 4614 } 4615 4616 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 4617 struct kvm_xcrs *guest_xcrs) 4618 { 4619 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 4620 guest_xcrs->nr_xcrs = 0; 4621 return; 4622 } 4623 4624 guest_xcrs->nr_xcrs = 1; 4625 guest_xcrs->flags = 0; 4626 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 4627 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 4628 } 4629 4630 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 4631 struct kvm_xcrs *guest_xcrs) 4632 { 4633 int i, r = 0; 4634 4635 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 4636 return -EINVAL; 4637 4638 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 4639 return -EINVAL; 4640 4641 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 4642 /* Only support XCR0 currently */ 4643 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { 4644 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 4645 guest_xcrs->xcrs[i].value); 4646 break; 4647 } 4648 if (r) 4649 r = -EINVAL; 4650 return r; 4651 } 4652 4653 /* 4654 * kvm_set_guest_paused() indicates to the guest kernel that it has been 4655 * stopped by the hypervisor. This function will be called from the host only. 4656 * EINVAL is returned when the host attempts to set the flag for a guest that 4657 * does not support pv clocks. 4658 */ 4659 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 4660 { 4661 if (!vcpu->arch.pv_time_enabled) 4662 return -EINVAL; 4663 vcpu->arch.pvclock_set_guest_stopped_request = true; 4664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 4665 return 0; 4666 } 4667 4668 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, 4669 struct kvm_enable_cap *cap) 4670 { 4671 int r; 4672 uint16_t vmcs_version; 4673 void __user *user_ptr; 4674 4675 if (cap->flags) 4676 return -EINVAL; 4677 4678 switch (cap->cap) { 4679 case KVM_CAP_HYPERV_SYNIC2: 4680 if (cap->args[0]) 4681 return -EINVAL; 4682 fallthrough; 4683 4684 case KVM_CAP_HYPERV_SYNIC: 4685 if (!irqchip_in_kernel(vcpu->kvm)) 4686 return -EINVAL; 4687 return kvm_hv_activate_synic(vcpu, cap->cap == 4688 KVM_CAP_HYPERV_SYNIC2); 4689 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 4690 if (!kvm_x86_ops.nested_ops->enable_evmcs) 4691 return -ENOTTY; 4692 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); 4693 if (!r) { 4694 user_ptr = (void __user *)(uintptr_t)cap->args[0]; 4695 if (copy_to_user(user_ptr, &vmcs_version, 4696 sizeof(vmcs_version))) 4697 r = -EFAULT; 4698 } 4699 return r; 4700 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: 4701 if (!kvm_x86_ops.enable_direct_tlbflush) 4702 return -ENOTTY; 4703 4704 return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); 4705 4706 case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: 4707 vcpu->arch.pv_cpuid.enforce = cap->args[0]; 4708 if (vcpu->arch.pv_cpuid.enforce) 4709 kvm_update_pv_runtime(vcpu); 4710 4711 return 0; 4712 4713 default: 4714 return -EINVAL; 4715 } 4716 } 4717 4718 long kvm_arch_vcpu_ioctl(struct file *filp, 4719 unsigned int ioctl, unsigned long arg) 4720 { 4721 struct kvm_vcpu *vcpu = filp->private_data; 4722 void __user *argp = (void __user *)arg; 4723 int r; 4724 union { 4725 struct kvm_lapic_state *lapic; 4726 struct kvm_xsave *xsave; 
4727 struct kvm_xcrs *xcrs; 4728 void *buffer; 4729 } u; 4730 4731 vcpu_load(vcpu); 4732 4733 u.buffer = NULL; 4734 switch (ioctl) { 4735 case KVM_GET_LAPIC: { 4736 r = -EINVAL; 4737 if (!lapic_in_kernel(vcpu)) 4738 goto out; 4739 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), 4740 GFP_KERNEL_ACCOUNT); 4741 4742 r = -ENOMEM; 4743 if (!u.lapic) 4744 goto out; 4745 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 4746 if (r) 4747 goto out; 4748 r = -EFAULT; 4749 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 4750 goto out; 4751 r = 0; 4752 break; 4753 } 4754 case KVM_SET_LAPIC: { 4755 r = -EINVAL; 4756 if (!lapic_in_kernel(vcpu)) 4757 goto out; 4758 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 4759 if (IS_ERR(u.lapic)) { 4760 r = PTR_ERR(u.lapic); 4761 goto out_nofree; 4762 } 4763 4764 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 4765 break; 4766 } 4767 case KVM_INTERRUPT: { 4768 struct kvm_interrupt irq; 4769 4770 r = -EFAULT; 4771 if (copy_from_user(&irq, argp, sizeof(irq))) 4772 goto out; 4773 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 4774 break; 4775 } 4776 case KVM_NMI: { 4777 r = kvm_vcpu_ioctl_nmi(vcpu); 4778 break; 4779 } 4780 case KVM_SMI: { 4781 r = kvm_vcpu_ioctl_smi(vcpu); 4782 break; 4783 } 4784 case KVM_SET_CPUID: { 4785 struct kvm_cpuid __user *cpuid_arg = argp; 4786 struct kvm_cpuid cpuid; 4787 4788 r = -EFAULT; 4789 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4790 goto out; 4791 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 4792 break; 4793 } 4794 case KVM_SET_CPUID2: { 4795 struct kvm_cpuid2 __user *cpuid_arg = argp; 4796 struct kvm_cpuid2 cpuid; 4797 4798 r = -EFAULT; 4799 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4800 goto out; 4801 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 4802 cpuid_arg->entries); 4803 break; 4804 } 4805 case KVM_GET_CPUID2: { 4806 struct kvm_cpuid2 __user *cpuid_arg = argp; 4807 struct kvm_cpuid2 cpuid; 4808 4809 r = -EFAULT; 4810 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4811 goto out; 4812 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 4813 cpuid_arg->entries); 4814 if (r) 4815 goto out; 4816 r = -EFAULT; 4817 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 4818 goto out; 4819 r = 0; 4820 break; 4821 } 4822 case KVM_GET_MSRS: { 4823 int idx = srcu_read_lock(&vcpu->kvm->srcu); 4824 r = msr_io(vcpu, argp, do_get_msr, 1); 4825 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4826 break; 4827 } 4828 case KVM_SET_MSRS: { 4829 int idx = srcu_read_lock(&vcpu->kvm->srcu); 4830 r = msr_io(vcpu, argp, do_set_msr, 0); 4831 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4832 break; 4833 } 4834 case KVM_TPR_ACCESS_REPORTING: { 4835 struct kvm_tpr_access_ctl tac; 4836 4837 r = -EFAULT; 4838 if (copy_from_user(&tac, argp, sizeof(tac))) 4839 goto out; 4840 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 4841 if (r) 4842 goto out; 4843 r = -EFAULT; 4844 if (copy_to_user(argp, &tac, sizeof(tac))) 4845 goto out; 4846 r = 0; 4847 break; 4848 }; 4849 case KVM_SET_VAPIC_ADDR: { 4850 struct kvm_vapic_addr va; 4851 int idx; 4852 4853 r = -EINVAL; 4854 if (!lapic_in_kernel(vcpu)) 4855 goto out; 4856 r = -EFAULT; 4857 if (copy_from_user(&va, argp, sizeof(va))) 4858 goto out; 4859 idx = srcu_read_lock(&vcpu->kvm->srcu); 4860 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 4861 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4862 break; 4863 } 4864 case KVM_X86_SETUP_MCE: { 4865 u64 mcg_cap; 4866 4867 r = -EFAULT; 4868 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap))) 4869 goto out; 4870 r = 
kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 4871 break; 4872 } 4873 case KVM_X86_SET_MCE: { 4874 struct kvm_x86_mce mce; 4875 4876 r = -EFAULT; 4877 if (copy_from_user(&mce, argp, sizeof(mce))) 4878 goto out; 4879 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 4880 break; 4881 } 4882 case KVM_GET_VCPU_EVENTS: { 4883 struct kvm_vcpu_events events; 4884 4885 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 4886 4887 r = -EFAULT; 4888 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 4889 break; 4890 r = 0; 4891 break; 4892 } 4893 case KVM_SET_VCPU_EVENTS: { 4894 struct kvm_vcpu_events events; 4895 4896 r = -EFAULT; 4897 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 4898 break; 4899 4900 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 4901 break; 4902 } 4903 case KVM_GET_DEBUGREGS: { 4904 struct kvm_debugregs dbgregs; 4905 4906 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 4907 4908 r = -EFAULT; 4909 if (copy_to_user(argp, &dbgregs, 4910 sizeof(struct kvm_debugregs))) 4911 break; 4912 r = 0; 4913 break; 4914 } 4915 case KVM_SET_DEBUGREGS: { 4916 struct kvm_debugregs dbgregs; 4917 4918 r = -EFAULT; 4919 if (copy_from_user(&dbgregs, argp, 4920 sizeof(struct kvm_debugregs))) 4921 break; 4922 4923 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 4924 break; 4925 } 4926 case KVM_GET_XSAVE: { 4927 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); 4928 r = -ENOMEM; 4929 if (!u.xsave) 4930 break; 4931 4932 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 4933 4934 r = -EFAULT; 4935 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 4936 break; 4937 r = 0; 4938 break; 4939 } 4940 case KVM_SET_XSAVE: { 4941 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 4942 if (IS_ERR(u.xsave)) { 4943 r = PTR_ERR(u.xsave); 4944 goto out_nofree; 4945 } 4946 4947 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 4948 break; 4949 } 4950 case KVM_GET_XCRS: { 4951 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); 4952 r = -ENOMEM; 4953 if (!u.xcrs) 4954 break; 4955 4956 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 4957 4958 r = -EFAULT; 4959 if (copy_to_user(argp, u.xcrs, 4960 sizeof(struct kvm_xcrs))) 4961 break; 4962 r = 0; 4963 break; 4964 } 4965 case KVM_SET_XCRS: { 4966 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 4967 if (IS_ERR(u.xcrs)) { 4968 r = PTR_ERR(u.xcrs); 4969 goto out_nofree; 4970 } 4971 4972 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 4973 break; 4974 } 4975 case KVM_SET_TSC_KHZ: { 4976 u32 user_tsc_khz; 4977 4978 r = -EINVAL; 4979 user_tsc_khz = (u32)arg; 4980 4981 if (kvm_has_tsc_control && 4982 user_tsc_khz >= kvm_max_guest_tsc_khz) 4983 goto out; 4984 4985 if (user_tsc_khz == 0) 4986 user_tsc_khz = tsc_khz; 4987 4988 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) 4989 r = 0; 4990 4991 goto out; 4992 } 4993 case KVM_GET_TSC_KHZ: { 4994 r = vcpu->arch.virtual_tsc_khz; 4995 goto out; 4996 } 4997 case KVM_KVMCLOCK_CTRL: { 4998 r = kvm_set_guest_paused(vcpu); 4999 goto out; 5000 } 5001 case KVM_ENABLE_CAP: { 5002 struct kvm_enable_cap cap; 5003 5004 r = -EFAULT; 5005 if (copy_from_user(&cap, argp, sizeof(cap))) 5006 goto out; 5007 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); 5008 break; 5009 } 5010 case KVM_GET_NESTED_STATE: { 5011 struct kvm_nested_state __user *user_kvm_nested_state = argp; 5012 u32 user_data_size; 5013 5014 r = -EINVAL; 5015 if (!kvm_x86_ops.nested_ops->get_state) 5016 break; 5017 5018 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); 5019 r = -EFAULT; 5020 if 
(get_user(user_data_size, &user_kvm_nested_state->size)) 5021 break; 5022 5023 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, 5024 user_data_size); 5025 if (r < 0) 5026 break; 5027 5028 if (r > user_data_size) { 5029 if (put_user(r, &user_kvm_nested_state->size)) 5030 r = -EFAULT; 5031 else 5032 r = -E2BIG; 5033 break; 5034 } 5035 5036 r = 0; 5037 break; 5038 } 5039 case KVM_SET_NESTED_STATE: { 5040 struct kvm_nested_state __user *user_kvm_nested_state = argp; 5041 struct kvm_nested_state kvm_state; 5042 int idx; 5043 5044 r = -EINVAL; 5045 if (!kvm_x86_ops.nested_ops->set_state) 5046 break; 5047 5048 r = -EFAULT; 5049 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) 5050 break; 5051 5052 r = -EINVAL; 5053 if (kvm_state.size < sizeof(kvm_state)) 5054 break; 5055 5056 if (kvm_state.flags & 5057 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE 5058 | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING 5059 | KVM_STATE_NESTED_GIF_SET)) 5060 break; 5061 5062 /* nested_run_pending implies guest_mode. */ 5063 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) 5064 && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) 5065 break; 5066 5067 idx = srcu_read_lock(&vcpu->kvm->srcu); 5068 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); 5069 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5070 break; 5071 } 5072 case KVM_GET_SUPPORTED_HV_CPUID: 5073 r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); 5074 break; 5075 #ifdef CONFIG_KVM_XEN 5076 case KVM_XEN_VCPU_GET_ATTR: { 5077 struct kvm_xen_vcpu_attr xva; 5078 5079 r = -EFAULT; 5080 if (copy_from_user(&xva, argp, sizeof(xva))) 5081 goto out; 5082 r = kvm_xen_vcpu_get_attr(vcpu, &xva); 5083 if (!r && copy_to_user(argp, &xva, sizeof(xva))) 5084 r = -EFAULT; 5085 break; 5086 } 5087 case KVM_XEN_VCPU_SET_ATTR: { 5088 struct kvm_xen_vcpu_attr xva; 5089 5090 r = -EFAULT; 5091 if (copy_from_user(&xva, argp, sizeof(xva))) 5092 goto out; 5093 r = kvm_xen_vcpu_set_attr(vcpu, &xva); 5094 break; 5095 } 5096 #endif 5097 default: 5098 r = -EINVAL; 5099 } 5100 out: 5101 kfree(u.buffer); 5102 out_nofree: 5103 vcpu_put(vcpu); 5104 return r; 5105 } 5106 5107 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 5108 { 5109 return VM_FAULT_SIGBUS; 5110 } 5111 5112 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 5113 { 5114 int ret; 5115 5116 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 5117 return -EINVAL; 5118 ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); 5119 return ret; 5120 } 5121 5122 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 5123 u64 ident_addr) 5124 { 5125 return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); 5126 } 5127 5128 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 5129 unsigned long kvm_nr_mmu_pages) 5130 { 5131 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 5132 return -EINVAL; 5133 5134 mutex_lock(&kvm->slots_lock); 5135 5136 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 5137 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 5138 5139 mutex_unlock(&kvm->slots_lock); 5140 return 0; 5141 } 5142 5143 static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 5144 { 5145 return kvm->arch.n_max_mmu_pages; 5146 } 5147 5148 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 5149 { 5150 struct kvm_pic *pic = kvm->arch.vpic; 5151 int r; 5152 5153 r = 0; 5154 switch (chip->chip_id) { 5155 case KVM_IRQCHIP_PIC_MASTER: 5156 
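		/* pics[0] is the master 8259, pics[1] the slave. */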
memcpy(&chip->chip.pic, &pic->pics[0], 5157 sizeof(struct kvm_pic_state)); 5158 break; 5159 case KVM_IRQCHIP_PIC_SLAVE: 5160 memcpy(&chip->chip.pic, &pic->pics[1], 5161 sizeof(struct kvm_pic_state)); 5162 break; 5163 case KVM_IRQCHIP_IOAPIC: 5164 kvm_get_ioapic(kvm, &chip->chip.ioapic); 5165 break; 5166 default: 5167 r = -EINVAL; 5168 break; 5169 } 5170 return r; 5171 } 5172 5173 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 5174 { 5175 struct kvm_pic *pic = kvm->arch.vpic; 5176 int r; 5177 5178 r = 0; 5179 switch (chip->chip_id) { 5180 case KVM_IRQCHIP_PIC_MASTER: 5181 spin_lock(&pic->lock); 5182 memcpy(&pic->pics[0], &chip->chip.pic, 5183 sizeof(struct kvm_pic_state)); 5184 spin_unlock(&pic->lock); 5185 break; 5186 case KVM_IRQCHIP_PIC_SLAVE: 5187 spin_lock(&pic->lock); 5188 memcpy(&pic->pics[1], &chip->chip.pic, 5189 sizeof(struct kvm_pic_state)); 5190 spin_unlock(&pic->lock); 5191 break; 5192 case KVM_IRQCHIP_IOAPIC: 5193 kvm_set_ioapic(kvm, &chip->chip.ioapic); 5194 break; 5195 default: 5196 r = -EINVAL; 5197 break; 5198 } 5199 kvm_pic_update_irq(pic); 5200 return r; 5201 } 5202 5203 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 5204 { 5205 struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; 5206 5207 BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); 5208 5209 mutex_lock(&kps->lock); 5210 memcpy(ps, &kps->channels, sizeof(*ps)); 5211 mutex_unlock(&kps->lock); 5212 return 0; 5213 } 5214 5215 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 5216 { 5217 int i; 5218 struct kvm_pit *pit = kvm->arch.vpit; 5219 5220 mutex_lock(&pit->pit_state.lock); 5221 memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); 5222 for (i = 0; i < 3; i++) 5223 kvm_pit_load_count(pit, i, ps->channels[i].count, 0); 5224 mutex_unlock(&pit->pit_state.lock); 5225 return 0; 5226 } 5227 5228 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 5229 { 5230 mutex_lock(&kvm->arch.vpit->pit_state.lock); 5231 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 5232 sizeof(ps->channels)); 5233 ps->flags = kvm->arch.vpit->pit_state.flags; 5234 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 5235 memset(&ps->reserved, 0, sizeof(ps->reserved)); 5236 return 0; 5237 } 5238 5239 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 5240 { 5241 int start = 0; 5242 int i; 5243 u32 prev_legacy, cur_legacy; 5244 struct kvm_pit *pit = kvm->arch.vpit; 5245 5246 mutex_lock(&pit->pit_state.lock); 5247 prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 5248 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 5249 if (!prev_legacy && cur_legacy) 5250 start = 1; 5251 memcpy(&pit->pit_state.channels, &ps->channels, 5252 sizeof(pit->pit_state.channels)); 5253 pit->pit_state.flags = ps->flags; 5254 for (i = 0; i < 3; i++) 5255 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, 5256 start && i == 0); 5257 mutex_unlock(&pit->pit_state.lock); 5258 return 0; 5259 } 5260 5261 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 5262 struct kvm_reinject_control *control) 5263 { 5264 struct kvm_pit *pit = kvm->arch.vpit; 5265 5266 /* pit->pit_state.lock was overloaded to prevent userspace from getting 5267 * an inconsistent state after running multiple KVM_REINJECT_CONTROL 5268 * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
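	 *
	 * Roughly speaking, reinjection is a policy toggle: with
	 * pit_reinject set, PIT ticks the guest missed are tracked and
	 * re-injected later instead of being dropped (see
	 * kvm_pit_set_reinject() in i8254.c).
	 *
	 * Illustrative userspace sketch (not part of this file, error
	 * handling omitted):
	 *
	 *	struct kvm_reinject_control ctl = { .pit_reinject = 1 };
	 *	ioctl(vm_fd, KVM_REINJECT_CONTROL, &ctl);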
5269 */ 5270 mutex_lock(&pit->pit_state.lock); 5271 kvm_pit_set_reinject(pit, control->pit_reinject); 5272 mutex_unlock(&pit->pit_state.lock); 5273 5274 return 0; 5275 } 5276 5277 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 5278 { 5279 5280 /* 5281 * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called 5282 * before reporting dirty_bitmap to userspace. KVM flushes the buffers 5283 * on all VM-Exits, thus we only need to kick running vCPUs to force a 5284 * VM-Exit. 5285 */ 5286 struct kvm_vcpu *vcpu; 5287 int i; 5288 5289 kvm_for_each_vcpu(i, vcpu, kvm) 5290 kvm_vcpu_kick(vcpu); 5291 } 5292 5293 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 5294 bool line_status) 5295 { 5296 if (!irqchip_in_kernel(kvm)) 5297 return -ENXIO; 5298 5299 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 5300 irq_event->irq, irq_event->level, 5301 line_status); 5302 return 0; 5303 } 5304 5305 int kvm_vm_ioctl_enable_cap(struct kvm *kvm, 5306 struct kvm_enable_cap *cap) 5307 { 5308 int r; 5309 5310 if (cap->flags) 5311 return -EINVAL; 5312 5313 switch (cap->cap) { 5314 case KVM_CAP_DISABLE_QUIRKS: 5315 kvm->arch.disabled_quirks = cap->args[0]; 5316 r = 0; 5317 break; 5318 case KVM_CAP_SPLIT_IRQCHIP: { 5319 mutex_lock(&kvm->lock); 5320 r = -EINVAL; 5321 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) 5322 goto split_irqchip_unlock; 5323 r = -EEXIST; 5324 if (irqchip_in_kernel(kvm)) 5325 goto split_irqchip_unlock; 5326 if (kvm->created_vcpus) 5327 goto split_irqchip_unlock; 5328 r = kvm_setup_empty_irq_routing(kvm); 5329 if (r) 5330 goto split_irqchip_unlock; 5331 /* Pairs with irqchip_in_kernel. */ 5332 smp_wmb(); 5333 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; 5334 kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; 5335 r = 0; 5336 split_irqchip_unlock: 5337 mutex_unlock(&kvm->lock); 5338 break; 5339 } 5340 case KVM_CAP_X2APIC_API: 5341 r = -EINVAL; 5342 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) 5343 break; 5344 5345 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) 5346 kvm->arch.x2apic_format = true; 5347 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) 5348 kvm->arch.x2apic_broadcast_quirk_disabled = true; 5349 5350 r = 0; 5351 break; 5352 case KVM_CAP_X86_DISABLE_EXITS: 5353 r = -EINVAL; 5354 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) 5355 break; 5356 5357 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && 5358 kvm_can_mwait_in_guest()) 5359 kvm->arch.mwait_in_guest = true; 5360 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) 5361 kvm->arch.hlt_in_guest = true; 5362 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) 5363 kvm->arch.pause_in_guest = true; 5364 if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) 5365 kvm->arch.cstate_in_guest = true; 5366 r = 0; 5367 break; 5368 case KVM_CAP_MSR_PLATFORM_INFO: 5369 kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; 5370 r = 0; 5371 break; 5372 case KVM_CAP_EXCEPTION_PAYLOAD: 5373 kvm->arch.exception_payload_enabled = cap->args[0]; 5374 r = 0; 5375 break; 5376 case KVM_CAP_X86_USER_SPACE_MSR: 5377 kvm->arch.user_space_msr_mask = cap->args[0]; 5378 r = 0; 5379 break; 5380 case KVM_CAP_X86_BUS_LOCK_EXIT: 5381 r = -EINVAL; 5382 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) 5383 break; 5384 5385 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && 5386 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) 5387 break; 5388 5389 if (kvm_has_bus_lock_exit && 5390 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) 5391 kvm->arch.bus_lock_detection_enabled = true; 5392 
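		/*
		 * Requesting only KVM_BUS_LOCK_DETECTION_OFF, or running on
		 * hardware without bus-lock exit support, leaves detection
		 * disabled; the ioctl still returns success.
		 */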
r = 0; 5393 break; 5394 default: 5395 r = -EINVAL; 5396 break; 5397 } 5398 return r; 5399 } 5400 5401 static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) 5402 { 5403 struct kvm_x86_msr_filter *msr_filter; 5404 5405 msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); 5406 if (!msr_filter) 5407 return NULL; 5408 5409 msr_filter->default_allow = default_allow; 5410 return msr_filter; 5411 } 5412 5413 static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) 5414 { 5415 u32 i; 5416 5417 if (!msr_filter) 5418 return; 5419 5420 for (i = 0; i < msr_filter->count; i++) 5421 kfree(msr_filter->ranges[i].bitmap); 5422 5423 kfree(msr_filter); 5424 } 5425 5426 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, 5427 struct kvm_msr_filter_range *user_range) 5428 { 5429 struct msr_bitmap_range range; 5430 unsigned long *bitmap = NULL; 5431 size_t bitmap_size; 5432 int r; 5433 5434 if (!user_range->nmsrs) 5435 return 0; 5436 5437 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); 5438 if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) 5439 return -EINVAL; 5440 5441 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); 5442 if (IS_ERR(bitmap)) 5443 return PTR_ERR(bitmap); 5444 5445 range = (struct msr_bitmap_range) { 5446 .flags = user_range->flags, 5447 .base = user_range->base, 5448 .nmsrs = user_range->nmsrs, 5449 .bitmap = bitmap, 5450 }; 5451 5452 if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { 5453 r = -EINVAL; 5454 goto err; 5455 } 5456 5457 if (!range.flags) { 5458 r = -EINVAL; 5459 goto err; 5460 } 5461 5462 /* Everything ok, add this range identifier. */ 5463 msr_filter->ranges[msr_filter->count] = range; 5464 msr_filter->count++; 5465 5466 return 0; 5467 err: 5468 kfree(bitmap); 5469 return r; 5470 } 5471 5472 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) 5473 { 5474 struct kvm_msr_filter __user *user_msr_filter = argp; 5475 struct kvm_x86_msr_filter *new_filter, *old_filter; 5476 struct kvm_msr_filter filter; 5477 bool default_allow; 5478 bool empty = true; 5479 int r = 0; 5480 u32 i; 5481 5482 if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) 5483 return -EFAULT; 5484 5485 for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) 5486 empty &= !filter.ranges[i].nmsrs; 5487 5488 default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); 5489 if (empty && !default_allow) 5490 return -EINVAL; 5491 5492 new_filter = kvm_alloc_msr_filter(default_allow); 5493 if (!new_filter) 5494 return -ENOMEM; 5495 5496 for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { 5497 r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); 5498 if (r) { 5499 kvm_free