/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
#include <linux/frame.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>

#include <asm/virtext.h>
#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id svm_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_SVM),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);

#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define SVM_FEATURE_NPT            (1 <<  0)
#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_NRIP           (1 <<  3)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

#define SVM_AVIC_DOORBELL	0xc001011b

#define NESTED_EXIT_HOST	0	/* Exit handled on host level */
#define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
#define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

#define TSC_RATIO_RSVD	0xffffff0000000000ULL
#define TSC_RATIO_MIN	0x0000000000000001ULL
#define TSC_RATIO_MAX	0x000000ffffffffffULL
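/*
 * Illustrative note (not part of the original code): the TSC_RATIO_*
 * values above describe MSR_AMD64_TSC_RATIO as a fixed-point ratio with
 * 32 fractional bits (kvm_tsc_scaling_ratio_frac_bits is set to 32 later
 * in this file), so for example:
 *
 *   0x0100000000ULL  ->  1.0  (TSC_RATIO_DEFAULT below, guest TSC at host rate)
 *   0x0180000000ULL  ->  1.5  (guest TSC would run at 1.5x the host rate)
 *
 * TSC_RATIO_MAX therefore caps the ratio just below 256.0.
 */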
#define AVIC_HPA_MASK	~((0xFFFULL << 52) | 0xFFF)

/*
 * 0xff is broadcast, so the max index allowed for physical APIC ID
 * table is 0xfe.  APIC IDs above 0xff are reserved.
 */
#define AVIC_MAX_PHYSICAL_ID_COUNT	255

#define AVIC_UNACCEL_ACCESS_WRITE_MASK		1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF

/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS	8
#define AVIC_VCPU_ID_MASK	((1 << AVIC_VCPU_ID_BITS) - 1)

#define AVIC_VM_ID_BITS		24
#define AVIC_VM_ID_NR		(1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK		((1 << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(x, y)	(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
				 (y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)	((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)	(x & AVIC_VCPU_ID_MASK)
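/*
 * Worked example, for illustration only (not part of the original code):
 * with the macros above, AVIC_GATAG(0x1234, 0x56) packs VM ID 0x1234 and
 * VCPU ID 0x56 into the tag 0x123456, and the decode macros recover the
 * original values:
 *
 *   AVIC_GATAG_TO_VMID(0x123456)   == 0x1234
 *   AVIC_GATAG_TO_VCPUID(0x123456) == 0x56
 */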
static bool erratum_383_found __read_mostly;

static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
	MSR_FS_BASE,
#endif
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_TSC_AUX,
};

#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)

struct kvm_sev_info {
	bool active;		/* SEV enabled guest */
	unsigned int asid;	/* ASID used for this guest */
	unsigned int handle;	/* SEV firmware handle */
	int fd;			/* SEV device fd */
	unsigned long pages_locked;	/* Number of pages locked */
	struct list_head regions_list;	/* List of registered regions */
};

struct kvm_svm {
	struct kvm kvm;

	/* Struct members for AVIC */
	u32 avic_vm_id;
	u32 ldr_mode;
	struct page *avic_logical_id_table_page;
	struct page *avic_physical_id_table_page;
	struct hlist_node hnode;

	struct kvm_sev_info sev_info;
};

struct kvm_vcpu;

struct nested_state {
	struct vmcb *hsave;
	u64 hsave_msr;
	u64 vm_cr_msr;
	u64 vmcb;

	/* These are the merged vectors */
	u32 *msrpm;

	/* gpa pointers to the real vectors */
	u64 vmcb_msrpm;
	u64 vmcb_iopm;

	/* A VMEXIT is required but not yet emulated */
	bool exit_required;

	/* cache for intercepts of the guest */
	u32 intercept_cr;
	u32 intercept_dr;
	u32 intercept_exceptions;
	u64 intercept;

	/* Nested Paging related state */
	u64 nested_cr3;
};

#define MSRPM_OFFSETS	16
static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

struct vcpu_svm {
	struct kvm_vcpu vcpu;
	struct vmcb *vmcb;
	unsigned long vmcb_pa;
	struct svm_cpu_data *svm_data;
	uint64_t asid_generation;
	uint64_t sysenter_esp;
	uint64_t sysenter_eip;
	uint64_t tsc_aux;

	u64 msr_decfg;

	u64 next_rip;

	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
	struct {
		u16 fs;
		u16 gs;
		u16 ldt;
		u64 gs_base;
	} host;

	u64 spec_ctrl;
	/*
	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
	 * translated into the appropriate L2_CFG bits on the host to
	 * perform speculative control.
	 */
	u64 virt_spec_ctrl;

	u32 *msrpm;

	ulong nmi_iret_rip;

	struct nested_state nested;

	bool nmi_singlestep;
	u64 nmi_singlestep_guest_rflags;

	unsigned int3_injected;
	unsigned long int3_rip;

	/* cached guest cpuid flags for faster access */
	bool nrips_enabled	: 1;

	u32 ldr_reg;
	struct page *avic_backing_page;
	u64 *avic_physical_id_cache;
	bool avic_is_running;

	/*
	 * Per-vcpu list of struct amd_svm_iommu_ir:
	 * This is used mainly to store interrupt remapping information used
	 * when updating the vcpu affinity.  This avoids the need to scan for
	 * IRTE and try to match ga_tag in the IOMMU driver.
	 */
	struct list_head ir_list;
	spinlock_t ir_list_lock;

	/* which host CPU was used for running this vcpu */
	unsigned int last_cpu;
};

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK		(1 << 31)

#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK	(0xFFULL)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	(0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK		(1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK		(1ULL << 63)
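/*
 * Sketch, derived from the masks above and for illustration only, of how a
 * physical APIC ID table entry is laid out and composed; this mirrors what
 * avic_init_backing_page() and avic_vcpu_load() do later in this file:
 *
 *   bits  0-7  : host physical APIC ID of the CPU running the vCPU
 *   bits 12-51 : host physical address of the vAPIC backing page
 *   bit  62    : IS_RUNNING
 *   bit  63    : VALID
 *
 *   u64 entry = (page_to_phys(backing_page) &
 *                AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
 *               AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
 */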
317 * 318 * pause_filter_thresh: In addition, some processor families support advanced 319 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on 320 * the amount of time a guest is allowed to execute in a pause loop. 321 * In this mode, a 16-bit pause filter threshold field is added in the 322 * VMCB. The threshold value is a cycle count that is used to reset the 323 * pause counter. As with simple pause filtering, VMRUN loads the pause 324 * count value from VMCB into an internal counter. Then, on each pause 325 * instruction the hardware checks the elapsed number of cycles since 326 * the most recent pause instruction against the pause filter threshold. 327 * If the elapsed cycle count is greater than the pause filter threshold, 328 * then the internal pause count is reloaded from the VMCB and execution 329 * continues. If the elapsed cycle count is less than the pause filter 330 * threshold, then the internal pause count is decremented. If the count 331 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is 332 * triggered. If advanced pause filtering is supported and pause filter 333 * threshold field is set to zero, the filter will operate in the simpler, 334 * count only mode. 335 */ 336 337 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP; 338 module_param(pause_filter_thresh, ushort, 0444); 339 340 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW; 341 module_param(pause_filter_count, ushort, 0444); 342 343 /* Default doubles per-vcpu window every exit. */ 344 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 345 module_param(pause_filter_count_grow, ushort, 0444); 346 347 /* Default resets per-vcpu window every exit to pause_filter_count. */ 348 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 349 module_param(pause_filter_count_shrink, ushort, 0444); 350 351 /* Default is to compute the maximum so we can never overflow. 
*/ 352 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; 353 module_param(pause_filter_count_max, ushort, 0444); 354 355 /* allow nested paging (virtualized MMU) for all guests */ 356 static int npt = true; 357 module_param(npt, int, S_IRUGO); 358 359 /* allow nested virtualization in KVM/SVM */ 360 static int nested = true; 361 module_param(nested, int, S_IRUGO); 362 363 /* enable / disable AVIC */ 364 static int avic; 365 #ifdef CONFIG_X86_LOCAL_APIC 366 module_param(avic, int, S_IRUGO); 367 #endif 368 369 /* enable/disable Virtual VMLOAD VMSAVE */ 370 static int vls = true; 371 module_param(vls, int, 0444); 372 373 /* enable/disable Virtual GIF */ 374 static int vgif = true; 375 module_param(vgif, int, 0444); 376 377 /* enable/disable SEV support */ 378 static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); 379 module_param(sev, int, 0444); 380 381 static u8 rsm_ins_bytes[] = "\x0f\xaa"; 382 383 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 384 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); 385 static void svm_complete_interrupts(struct vcpu_svm *svm); 386 387 static int nested_svm_exit_handled(struct vcpu_svm *svm); 388 static int nested_svm_intercept(struct vcpu_svm *svm); 389 static int nested_svm_vmexit(struct vcpu_svm *svm); 390 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 391 bool has_error_code, u32 error_code); 392 393 enum { 394 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, 395 pause filter count */ 396 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ 397 VMCB_ASID, /* ASID */ 398 VMCB_INTR, /* int_ctl, int_vector */ 399 VMCB_NPT, /* npt_en, nCR3, gPAT */ 400 VMCB_CR, /* CR0, CR3, CR4, EFER */ 401 VMCB_DR, /* DR6, DR7 */ 402 VMCB_DT, /* GDT, IDT */ 403 VMCB_SEG, /* CS, DS, SS, ES, CPL */ 404 VMCB_CR2, /* CR2 only */ 405 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ 406 VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, 407 * AVIC PHYSICAL_TABLE pointer, 408 * AVIC LOGICAL_TABLE pointer 409 */ 410 VMCB_DIRTY_MAX, 411 }; 412 413 /* TPR and CR2 are always written before VMRUN */ 414 #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) 415 416 #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL 417 418 static unsigned int max_sev_asid; 419 static unsigned int min_sev_asid; 420 static unsigned long *sev_asid_bitmap; 421 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) 422 423 struct enc_region { 424 struct list_head list; 425 unsigned long npages; 426 struct page **pages; 427 unsigned long uaddr; 428 unsigned long size; 429 }; 430 431 432 static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) 433 { 434 return container_of(kvm, struct kvm_svm, kvm); 435 } 436 437 static inline bool svm_sev_enabled(void) 438 { 439 return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? 
max_sev_asid : 0; 440 } 441 442 static inline bool sev_guest(struct kvm *kvm) 443 { 444 #ifdef CONFIG_KVM_AMD_SEV 445 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 446 447 return sev->active; 448 #else 449 return false; 450 #endif 451 } 452 453 static inline int sev_get_asid(struct kvm *kvm) 454 { 455 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 456 457 return sev->asid; 458 } 459 460 static inline void mark_all_dirty(struct vmcb *vmcb) 461 { 462 vmcb->control.clean = 0; 463 } 464 465 static inline void mark_all_clean(struct vmcb *vmcb) 466 { 467 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) 468 & ~VMCB_ALWAYS_DIRTY_MASK; 469 } 470 471 static inline void mark_dirty(struct vmcb *vmcb, int bit) 472 { 473 vmcb->control.clean &= ~(1 << bit); 474 } 475 476 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 477 { 478 return container_of(vcpu, struct vcpu_svm, vcpu); 479 } 480 481 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) 482 { 483 svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; 484 mark_dirty(svm->vmcb, VMCB_AVIC); 485 } 486 487 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) 488 { 489 struct vcpu_svm *svm = to_svm(vcpu); 490 u64 *entry = svm->avic_physical_id_cache; 491 492 if (!entry) 493 return false; 494 495 return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 496 } 497 498 static void recalc_intercepts(struct vcpu_svm *svm) 499 { 500 struct vmcb_control_area *c, *h; 501 struct nested_state *g; 502 503 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 504 505 if (!is_guest_mode(&svm->vcpu)) 506 return; 507 508 c = &svm->vmcb->control; 509 h = &svm->nested.hsave->control; 510 g = &svm->nested; 511 512 c->intercept_cr = h->intercept_cr | g->intercept_cr; 513 c->intercept_dr = h->intercept_dr | g->intercept_dr; 514 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; 515 c->intercept = h->intercept | g->intercept; 516 } 517 518 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) 519 { 520 if (is_guest_mode(&svm->vcpu)) 521 return svm->nested.hsave; 522 else 523 return svm->vmcb; 524 } 525 526 static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) 527 { 528 struct vmcb *vmcb = get_host_vmcb(svm); 529 530 vmcb->control.intercept_cr |= (1U << bit); 531 532 recalc_intercepts(svm); 533 } 534 535 static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) 536 { 537 struct vmcb *vmcb = get_host_vmcb(svm); 538 539 vmcb->control.intercept_cr &= ~(1U << bit); 540 541 recalc_intercepts(svm); 542 } 543 544 static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) 545 { 546 struct vmcb *vmcb = get_host_vmcb(svm); 547 548 return vmcb->control.intercept_cr & (1U << bit); 549 } 550 551 static inline void set_dr_intercepts(struct vcpu_svm *svm) 552 { 553 struct vmcb *vmcb = get_host_vmcb(svm); 554 555 vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) 556 | (1 << INTERCEPT_DR1_READ) 557 | (1 << INTERCEPT_DR2_READ) 558 | (1 << INTERCEPT_DR3_READ) 559 | (1 << INTERCEPT_DR4_READ) 560 | (1 << INTERCEPT_DR5_READ) 561 | (1 << INTERCEPT_DR6_READ) 562 | (1 << INTERCEPT_DR7_READ) 563 | (1 << INTERCEPT_DR0_WRITE) 564 | (1 << INTERCEPT_DR1_WRITE) 565 | (1 << INTERCEPT_DR2_WRITE) 566 | (1 << INTERCEPT_DR3_WRITE) 567 | (1 << INTERCEPT_DR4_WRITE) 568 | (1 << INTERCEPT_DR5_WRITE) 569 | (1 << INTERCEPT_DR6_WRITE) 570 | (1 << INTERCEPT_DR7_WRITE); 571 572 recalc_intercepts(svm); 573 } 574 575 static inline void clr_dr_intercepts(struct vcpu_svm *svm) 576 { 577 
struct vmcb *vmcb = get_host_vmcb(svm); 578 579 vmcb->control.intercept_dr = 0; 580 581 recalc_intercepts(svm); 582 } 583 584 static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) 585 { 586 struct vmcb *vmcb = get_host_vmcb(svm); 587 588 vmcb->control.intercept_exceptions |= (1U << bit); 589 590 recalc_intercepts(svm); 591 } 592 593 static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) 594 { 595 struct vmcb *vmcb = get_host_vmcb(svm); 596 597 vmcb->control.intercept_exceptions &= ~(1U << bit); 598 599 recalc_intercepts(svm); 600 } 601 602 static inline void set_intercept(struct vcpu_svm *svm, int bit) 603 { 604 struct vmcb *vmcb = get_host_vmcb(svm); 605 606 vmcb->control.intercept |= (1ULL << bit); 607 608 recalc_intercepts(svm); 609 } 610 611 static inline void clr_intercept(struct vcpu_svm *svm, int bit) 612 { 613 struct vmcb *vmcb = get_host_vmcb(svm); 614 615 vmcb->control.intercept &= ~(1ULL << bit); 616 617 recalc_intercepts(svm); 618 } 619 620 static inline bool vgif_enabled(struct vcpu_svm *svm) 621 { 622 return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); 623 } 624 625 static inline void enable_gif(struct vcpu_svm *svm) 626 { 627 if (vgif_enabled(svm)) 628 svm->vmcb->control.int_ctl |= V_GIF_MASK; 629 else 630 svm->vcpu.arch.hflags |= HF_GIF_MASK; 631 } 632 633 static inline void disable_gif(struct vcpu_svm *svm) 634 { 635 if (vgif_enabled(svm)) 636 svm->vmcb->control.int_ctl &= ~V_GIF_MASK; 637 else 638 svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 639 } 640 641 static inline bool gif_set(struct vcpu_svm *svm) 642 { 643 if (vgif_enabled(svm)) 644 return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); 645 else 646 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); 647 } 648 649 static unsigned long iopm_base; 650 651 struct kvm_ldttss_desc { 652 u16 limit0; 653 u16 base0; 654 unsigned base1:8, type:5, dpl:2, p:1; 655 unsigned limit1:4, zero0:3, g:1, base2:8; 656 u32 base3; 657 u32 zero1; 658 } __attribute__((packed)); 659 660 struct svm_cpu_data { 661 int cpu; 662 663 u64 asid_generation; 664 u32 max_asid; 665 u32 next_asid; 666 u32 min_asid; 667 struct kvm_ldttss_desc *tss_desc; 668 669 struct page *save_area; 670 struct vmcb *current_vmcb; 671 672 /* index = sev_asid, value = vmcb pointer */ 673 struct vmcb **sev_vmcbs; 674 }; 675 676 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 677 678 struct svm_init_data { 679 int cpu; 680 int r; 681 }; 682 683 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 684 685 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 686 #define MSRS_RANGE_SIZE 2048 687 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 688 689 static u32 svm_msrpm_offset(u32 msr) 690 { 691 u32 offset; 692 int i; 693 694 for (i = 0; i < NUM_MSR_MAPS; i++) { 695 if (msr < msrpm_ranges[i] || 696 msr >= msrpm_ranges[i] + MSRS_IN_RANGE) 697 continue; 698 699 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ 700 offset += (i * MSRS_RANGE_SIZE); /* add range offset */ 701 702 /* Now we have the u8 offset - but need the u32 offset */ 703 return offset / 4; 704 } 705 706 /* MSR not in any range */ 707 return MSR_INVALID; 708 } 709 710 #define MAX_INST_SIZE 15 711 712 static inline void clgi(void) 713 { 714 asm volatile (__ex(SVM_CLGI)); 715 } 716 717 static inline void stgi(void) 718 { 719 asm volatile (__ex(SVM_STGI)); 720 } 721 722 static inline void invlpga(unsigned long addr, u32 asid) 723 { 724 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 725 } 726 727 static int get_npt_level(struct kvm_vcpu *vcpu) 728 { 
729 #ifdef CONFIG_X86_64 730 return PT64_ROOT_4LEVEL; 731 #else 732 return PT32E_ROOT_LEVEL; 733 #endif 734 } 735 736 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 737 { 738 vcpu->arch.efer = efer; 739 if (!npt_enabled && !(efer & EFER_LMA)) 740 efer &= ~EFER_LME; 741 742 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 743 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 744 } 745 746 static int is_external_interrupt(u32 info) 747 { 748 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 749 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 750 } 751 752 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) 753 { 754 struct vcpu_svm *svm = to_svm(vcpu); 755 u32 ret = 0; 756 757 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 758 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; 759 return ret; 760 } 761 762 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 763 { 764 struct vcpu_svm *svm = to_svm(vcpu); 765 766 if (mask == 0) 767 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 768 else 769 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; 770 771 } 772 773 static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 774 { 775 struct vcpu_svm *svm = to_svm(vcpu); 776 777 if (svm->vmcb->control.next_rip != 0) { 778 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); 779 svm->next_rip = svm->vmcb->control.next_rip; 780 } 781 782 if (!svm->next_rip) { 783 if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != 784 EMULATE_DONE) 785 printk(KERN_DEBUG "%s: NOP\n", __func__); 786 return; 787 } 788 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 789 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", 790 __func__, kvm_rip_read(vcpu), svm->next_rip); 791 792 kvm_rip_write(vcpu, svm->next_rip); 793 svm_set_interrupt_shadow(vcpu, 0); 794 } 795 796 static void svm_queue_exception(struct kvm_vcpu *vcpu) 797 { 798 struct vcpu_svm *svm = to_svm(vcpu); 799 unsigned nr = vcpu->arch.exception.nr; 800 bool has_error_code = vcpu->arch.exception.has_error_code; 801 bool reinject = vcpu->arch.exception.injected; 802 u32 error_code = vcpu->arch.exception.error_code; 803 804 /* 805 * If we are within a nested VM we'd better #VMEXIT and let the guest 806 * handle the exception 807 */ 808 if (!reinject && 809 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 810 return; 811 812 kvm_deliver_exception_payload(&svm->vcpu); 813 814 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { 815 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 816 817 /* 818 * For guest debugging where we have to reinject #BP if some 819 * INT3 is guest-owned: 820 * Emulate nRIP by moving RIP forward. Will fail if injection 821 * raises a fault that is not intercepted. Still better than 822 * failing in all cases. 823 */ 824 skip_emulated_instruction(&svm->vcpu); 825 rip = kvm_rip_read(&svm->vcpu); 826 svm->int3_rip = rip + svm->vmcb->save.cs.base; 827 svm->int3_injected = rip - old_rip; 828 } 829 830 svm->vmcb->control.event_inj = nr 831 | SVM_EVTINJ_VALID 832 | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) 833 | SVM_EVTINJ_TYPE_EXEPT; 834 svm->vmcb->control.event_inj_err = error_code; 835 } 836 837 static void svm_init_erratum_383(void) 838 { 839 u32 low, high; 840 int err; 841 u64 val; 842 843 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) 844 return; 845 846 /* Use _safe variants to not break nested virtualization */ 847 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); 848 if (err) 849 return; 850 851 val |= (1ULL << 47); 852 853 low = lower_32_bits(val); 854 high = upper_32_bits(val); 855 856 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); 857 858 erratum_383_found = true; 859 } 860 861 static void svm_init_osvw(struct kvm_vcpu *vcpu) 862 { 863 /* 864 * Guests should see errata 400 and 415 as fixed (assuming that 865 * HLT and IO instructions are intercepted). 866 */ 867 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; 868 vcpu->arch.osvw.status = osvw_status & ~(6ULL); 869 870 /* 871 * By increasing VCPU's osvw.length to 3 we are telling the guest that 872 * all osvw.status bits inside that length, including bit 0 (which is 873 * reserved for erratum 298), are valid. However, if host processor's 874 * osvw_len is 0 then osvw_status[0] carries no information. We need to 875 * be conservative here and therefore we tell the guest that erratum 298 876 * is present (because we really don't know). 877 */ 878 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) 879 vcpu->arch.osvw.status |= 1; 880 } 881 882 static int has_svm(void) 883 { 884 const char *msg; 885 886 if (!cpu_has_svm(&msg)) { 887 printk(KERN_INFO "has_svm: %s\n", msg); 888 return 0; 889 } 890 891 return 1; 892 } 893 894 static void svm_hardware_disable(void) 895 { 896 /* Make sure we clean up behind us */ 897 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) 898 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); 899 900 cpu_svm_disable(); 901 902 amd_pmu_disable_virt(); 903 } 904 905 static int svm_hardware_enable(void) 906 { 907 908 struct svm_cpu_data *sd; 909 uint64_t efer; 910 struct desc_struct *gdt; 911 int me = raw_smp_processor_id(); 912 913 rdmsrl(MSR_EFER, efer); 914 if (efer & EFER_SVME) 915 return -EBUSY; 916 917 if (!has_svm()) { 918 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); 919 return -EINVAL; 920 } 921 sd = per_cpu(svm_data, me); 922 if (!sd) { 923 pr_err("%s: svm_data is NULL on %d\n", __func__, me); 924 return -EINVAL; 925 } 926 927 sd->asid_generation = 1; 928 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 929 sd->next_asid = sd->max_asid + 1; 930 sd->min_asid = max_sev_asid + 1; 931 932 gdt = get_current_gdt_rw(); 933 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 934 935 wrmsrl(MSR_EFER, efer | EFER_SVME); 936 937 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 938 939 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 940 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); 941 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); 942 } 943 944 945 /* 946 * Get OSVW bits. 947 * 948 * Note that it is possible to have a system with mixed processor 949 * revisions and therefore different OSVW bits. If bits are not the same 950 * on different processors then choose the worst case (i.e. if erratum 951 * is present on one processor and not on another then assume that the 952 * erratum is present everywhere). 
953 */ 954 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { 955 uint64_t len, status = 0; 956 int err; 957 958 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); 959 if (!err) 960 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, 961 &err); 962 963 if (err) 964 osvw_status = osvw_len = 0; 965 else { 966 if (len < osvw_len) 967 osvw_len = len; 968 osvw_status |= status; 969 osvw_status &= (1ULL << osvw_len) - 1; 970 } 971 } else 972 osvw_status = osvw_len = 0; 973 974 svm_init_erratum_383(); 975 976 amd_pmu_enable_virt(); 977 978 return 0; 979 } 980 981 static void svm_cpu_uninit(int cpu) 982 { 983 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); 984 985 if (!sd) 986 return; 987 988 per_cpu(svm_data, raw_smp_processor_id()) = NULL; 989 kfree(sd->sev_vmcbs); 990 __free_page(sd->save_area); 991 kfree(sd); 992 } 993 994 static int svm_cpu_init(int cpu) 995 { 996 struct svm_cpu_data *sd; 997 int r; 998 999 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 1000 if (!sd) 1001 return -ENOMEM; 1002 sd->cpu = cpu; 1003 r = -ENOMEM; 1004 sd->save_area = alloc_page(GFP_KERNEL); 1005 if (!sd->save_area) 1006 goto err_1; 1007 1008 if (svm_sev_enabled()) { 1009 r = -ENOMEM; 1010 sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1, 1011 sizeof(void *), 1012 GFP_KERNEL); 1013 if (!sd->sev_vmcbs) 1014 goto err_1; 1015 } 1016 1017 per_cpu(svm_data, cpu) = sd; 1018 1019 return 0; 1020 1021 err_1: 1022 kfree(sd); 1023 return r; 1024 1025 } 1026 1027 static bool valid_msr_intercept(u32 index) 1028 { 1029 int i; 1030 1031 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) 1032 if (direct_access_msrs[i].index == index) 1033 return true; 1034 1035 return false; 1036 } 1037 1038 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) 1039 { 1040 u8 bit_write; 1041 unsigned long tmp; 1042 u32 offset; 1043 u32 *msrpm; 1044 1045 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: 1046 to_svm(vcpu)->msrpm; 1047 1048 offset = svm_msrpm_offset(msr); 1049 bit_write = 2 * (msr & 0x0f) + 1; 1050 tmp = msrpm[offset]; 1051 1052 BUG_ON(offset == MSR_INVALID); 1053 1054 return !!test_bit(bit_write, &tmp); 1055 } 1056 1057 static void set_msr_interception(u32 *msrpm, unsigned msr, 1058 int read, int write) 1059 { 1060 u8 bit_read, bit_write; 1061 unsigned long tmp; 1062 u32 offset; 1063 1064 /* 1065 * If this warning triggers extend the direct_access_msrs list at the 1066 * beginning of the file 1067 */ 1068 WARN_ON(!valid_msr_intercept(msr)); 1069 1070 offset = svm_msrpm_offset(msr); 1071 bit_read = 2 * (msr & 0x0f); 1072 bit_write = 2 * (msr & 0x0f) + 1; 1073 tmp = msrpm[offset]; 1074 1075 BUG_ON(offset == MSR_INVALID); 1076 1077 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); 1078 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); 1079 1080 msrpm[offset] = tmp; 1081 } 1082 1083 static void svm_vcpu_init_msrpm(u32 *msrpm) 1084 { 1085 int i; 1086 1087 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 1088 1089 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 1090 if (!direct_access_msrs[i].always) 1091 continue; 1092 1093 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); 1094 } 1095 } 1096 1097 static void add_msr_offset(u32 offset) 1098 { 1099 int i; 1100 1101 for (i = 0; i < MSRPM_OFFSETS; ++i) { 1102 1103 /* Offset already in list? */ 1104 if (msrpm_offsets[i] == offset) 1105 return; 1106 1107 /* Slot used by another offset? 
*/ 1108 if (msrpm_offsets[i] != MSR_INVALID) 1109 continue; 1110 1111 /* Add offset to list */ 1112 msrpm_offsets[i] = offset; 1113 1114 return; 1115 } 1116 1117 /* 1118 * If this BUG triggers the msrpm_offsets table has an overflow. Just 1119 * increase MSRPM_OFFSETS in this case. 1120 */ 1121 BUG(); 1122 } 1123 1124 static void init_msrpm_offsets(void) 1125 { 1126 int i; 1127 1128 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); 1129 1130 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 1131 u32 offset; 1132 1133 offset = svm_msrpm_offset(direct_access_msrs[i].index); 1134 BUG_ON(offset == MSR_INVALID); 1135 1136 add_msr_offset(offset); 1137 } 1138 } 1139 1140 static void svm_enable_lbrv(struct vcpu_svm *svm) 1141 { 1142 u32 *msrpm = svm->msrpm; 1143 1144 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 1145 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); 1146 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); 1147 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); 1148 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); 1149 } 1150 1151 static void svm_disable_lbrv(struct vcpu_svm *svm) 1152 { 1153 u32 *msrpm = svm->msrpm; 1154 1155 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 1156 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); 1157 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); 1158 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); 1159 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); 1160 } 1161 1162 static void disable_nmi_singlestep(struct vcpu_svm *svm) 1163 { 1164 svm->nmi_singlestep = false; 1165 1166 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 1167 /* Clear our flags if they were not set by the guest */ 1168 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 1169 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; 1170 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 1171 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; 1172 } 1173 } 1174 1175 /* Note: 1176 * This hash table is used to map VM_ID to a struct kvm_svm, 1177 * when handling AMD IOMMU GALOG notification to schedule in 1178 * a particular vCPU. 1179 */ 1180 #define SVM_VM_DATA_HASH_BITS 8 1181 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 1182 static u32 next_vm_id = 0; 1183 static bool next_vm_id_wrapped = 0; 1184 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 1185 1186 /* Note: 1187 * This function is called from IOMMU driver to notify 1188 * SVM to schedule in a particular vCPU of a particular VM. 1189 */ 1190 static int avic_ga_log_notifier(u32 ga_tag) 1191 { 1192 unsigned long flags; 1193 struct kvm_svm *kvm_svm; 1194 struct kvm_vcpu *vcpu = NULL; 1195 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); 1196 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); 1197 1198 pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); 1199 1200 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1201 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { 1202 if (kvm_svm->avic_vm_id != vm_id) 1203 continue; 1204 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); 1205 break; 1206 } 1207 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1208 1209 /* Note: 1210 * At this point, the IOMMU should have already set the pending 1211 * bit in the vAPIC backing page. So, we just need to schedule 1212 * in the vcpu. 
1213 */ 1214 if (vcpu) 1215 kvm_vcpu_wake_up(vcpu); 1216 1217 return 0; 1218 } 1219 1220 static __init int sev_hardware_setup(void) 1221 { 1222 struct sev_user_data_status *status; 1223 int rc; 1224 1225 /* Maximum number of encrypted guests supported simultaneously */ 1226 max_sev_asid = cpuid_ecx(0x8000001F); 1227 1228 if (!max_sev_asid) 1229 return 1; 1230 1231 /* Minimum ASID value that should be used for SEV guest */ 1232 min_sev_asid = cpuid_edx(0x8000001F); 1233 1234 /* Initialize SEV ASID bitmap */ 1235 sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); 1236 if (!sev_asid_bitmap) 1237 return 1; 1238 1239 status = kmalloc(sizeof(*status), GFP_KERNEL); 1240 if (!status) 1241 return 1; 1242 1243 /* 1244 * Check SEV platform status. 1245 * 1246 * PLATFORM_STATUS can be called in any state, if we failed to query 1247 * the PLATFORM status then either PSP firmware does not support SEV 1248 * feature or SEV firmware is dead. 1249 */ 1250 rc = sev_platform_status(status, NULL); 1251 if (rc) 1252 goto err; 1253 1254 pr_info("SEV supported\n"); 1255 1256 err: 1257 kfree(status); 1258 return rc; 1259 } 1260 1261 static void grow_ple_window(struct kvm_vcpu *vcpu) 1262 { 1263 struct vcpu_svm *svm = to_svm(vcpu); 1264 struct vmcb_control_area *control = &svm->vmcb->control; 1265 int old = control->pause_filter_count; 1266 1267 control->pause_filter_count = __grow_ple_window(old, 1268 pause_filter_count, 1269 pause_filter_count_grow, 1270 pause_filter_count_max); 1271 1272 if (control->pause_filter_count != old) 1273 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1274 1275 trace_kvm_ple_window_grow(vcpu->vcpu_id, 1276 control->pause_filter_count, old); 1277 } 1278 1279 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1280 { 1281 struct vcpu_svm *svm = to_svm(vcpu); 1282 struct vmcb_control_area *control = &svm->vmcb->control; 1283 int old = control->pause_filter_count; 1284 1285 control->pause_filter_count = 1286 __shrink_ple_window(old, 1287 pause_filter_count, 1288 pause_filter_count_shrink, 1289 pause_filter_count); 1290 if (control->pause_filter_count != old) 1291 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1292 1293 trace_kvm_ple_window_shrink(vcpu->vcpu_id, 1294 control->pause_filter_count, old); 1295 } 1296 1297 static __init int svm_hardware_setup(void) 1298 { 1299 int cpu; 1300 struct page *iopm_pages; 1301 void *iopm_va; 1302 int r; 1303 1304 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); 1305 1306 if (!iopm_pages) 1307 return -ENOMEM; 1308 1309 iopm_va = page_address(iopm_pages); 1310 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 1311 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 1312 1313 init_msrpm_offsets(); 1314 1315 if (boot_cpu_has(X86_FEATURE_NX)) 1316 kvm_enable_efer_bits(EFER_NX); 1317 1318 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 1319 kvm_enable_efer_bits(EFER_FFXSR); 1320 1321 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 1322 kvm_has_tsc_control = true; 1323 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; 1324 kvm_tsc_scaling_ratio_frac_bits = 32; 1325 } 1326 1327 /* Check for pause filtering support */ 1328 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 1329 pause_filter_count = 0; 1330 pause_filter_thresh = 0; 1331 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 1332 pause_filter_thresh = 0; 1333 } 1334 1335 if (nested) { 1336 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 1337 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 1338 } 1339 1340 if (sev) { 1341 if (boot_cpu_has(X86_FEATURE_SEV) && 1342 IS_ENABLED(CONFIG_KVM_AMD_SEV)) { 1343 r 
= sev_hardware_setup(); 1344 if (r) 1345 sev = false; 1346 } else { 1347 sev = false; 1348 } 1349 } 1350 1351 for_each_possible_cpu(cpu) { 1352 r = svm_cpu_init(cpu); 1353 if (r) 1354 goto err; 1355 } 1356 1357 if (!boot_cpu_has(X86_FEATURE_NPT)) 1358 npt_enabled = false; 1359 1360 if (npt_enabled && !npt) { 1361 printk(KERN_INFO "kvm: Nested Paging disabled\n"); 1362 npt_enabled = false; 1363 } 1364 1365 if (npt_enabled) { 1366 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 1367 kvm_enable_tdp(); 1368 } else 1369 kvm_disable_tdp(); 1370 1371 if (avic) { 1372 if (!npt_enabled || 1373 !boot_cpu_has(X86_FEATURE_AVIC) || 1374 !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { 1375 avic = false; 1376 } else { 1377 pr_info("AVIC enabled\n"); 1378 1379 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1380 } 1381 } 1382 1383 if (vls) { 1384 if (!npt_enabled || 1385 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 1386 !IS_ENABLED(CONFIG_X86_64)) { 1387 vls = false; 1388 } else { 1389 pr_info("Virtual VMLOAD VMSAVE supported\n"); 1390 } 1391 } 1392 1393 if (vgif) { 1394 if (!boot_cpu_has(X86_FEATURE_VGIF)) 1395 vgif = false; 1396 else 1397 pr_info("Virtual GIF supported\n"); 1398 } 1399 1400 return 0; 1401 1402 err: 1403 __free_pages(iopm_pages, IOPM_ALLOC_ORDER); 1404 iopm_base = 0; 1405 return r; 1406 } 1407 1408 static __exit void svm_hardware_unsetup(void) 1409 { 1410 int cpu; 1411 1412 if (svm_sev_enabled()) 1413 bitmap_free(sev_asid_bitmap); 1414 1415 for_each_possible_cpu(cpu) 1416 svm_cpu_uninit(cpu); 1417 1418 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 1419 iopm_base = 0; 1420 } 1421 1422 static void init_seg(struct vmcb_seg *seg) 1423 { 1424 seg->selector = 0; 1425 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 1426 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 1427 seg->limit = 0xffff; 1428 seg->base = 0; 1429 } 1430 1431 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) 1432 { 1433 seg->selector = 0; 1434 seg->attrib = SVM_SELECTOR_P_MASK | type; 1435 seg->limit = 0xffff; 1436 seg->base = 0; 1437 } 1438 1439 static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) 1440 { 1441 struct vcpu_svm *svm = to_svm(vcpu); 1442 1443 if (is_guest_mode(vcpu)) 1444 return svm->nested.hsave->control.tsc_offset; 1445 1446 return vcpu->arch.tsc_offset; 1447 } 1448 1449 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1450 { 1451 struct vcpu_svm *svm = to_svm(vcpu); 1452 u64 g_tsc_offset = 0; 1453 1454 if (is_guest_mode(vcpu)) { 1455 /* Write L1's TSC offset. 
*/ 1456 g_tsc_offset = svm->vmcb->control.tsc_offset - 1457 svm->nested.hsave->control.tsc_offset; 1458 svm->nested.hsave->control.tsc_offset = offset; 1459 } else 1460 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 1461 svm->vmcb->control.tsc_offset, 1462 offset); 1463 1464 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 1465 1466 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1467 return svm->vmcb->control.tsc_offset; 1468 } 1469 1470 static void avic_init_vmcb(struct vcpu_svm *svm) 1471 { 1472 struct vmcb *vmcb = svm->vmcb; 1473 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); 1474 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 1475 phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); 1476 phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); 1477 1478 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1479 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; 1480 vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; 1481 vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; 1482 vmcb->control.int_ctl |= AVIC_ENABLE_MASK; 1483 } 1484 1485 static void init_vmcb(struct vcpu_svm *svm) 1486 { 1487 struct vmcb_control_area *control = &svm->vmcb->control; 1488 struct vmcb_save_area *save = &svm->vmcb->save; 1489 1490 svm->vcpu.arch.hflags = 0; 1491 1492 set_cr_intercept(svm, INTERCEPT_CR0_READ); 1493 set_cr_intercept(svm, INTERCEPT_CR3_READ); 1494 set_cr_intercept(svm, INTERCEPT_CR4_READ); 1495 set_cr_intercept(svm, INTERCEPT_CR0_WRITE); 1496 set_cr_intercept(svm, INTERCEPT_CR3_WRITE); 1497 set_cr_intercept(svm, INTERCEPT_CR4_WRITE); 1498 if (!kvm_vcpu_apicv_active(&svm->vcpu)) 1499 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 1500 1501 set_dr_intercepts(svm); 1502 1503 set_exception_intercept(svm, PF_VECTOR); 1504 set_exception_intercept(svm, UD_VECTOR); 1505 set_exception_intercept(svm, MC_VECTOR); 1506 set_exception_intercept(svm, AC_VECTOR); 1507 set_exception_intercept(svm, DB_VECTOR); 1508 /* 1509 * Guest access to VMware backdoor ports could legitimately 1510 * trigger #GP because of TSS I/O permission bitmap. 1511 * We intercept those #GP and allow access to them anyway 1512 * as VMware does. 
1513 */ 1514 if (enable_vmware_backdoor) 1515 set_exception_intercept(svm, GP_VECTOR); 1516 1517 set_intercept(svm, INTERCEPT_INTR); 1518 set_intercept(svm, INTERCEPT_NMI); 1519 set_intercept(svm, INTERCEPT_SMI); 1520 set_intercept(svm, INTERCEPT_SELECTIVE_CR0); 1521 set_intercept(svm, INTERCEPT_RDPMC); 1522 set_intercept(svm, INTERCEPT_CPUID); 1523 set_intercept(svm, INTERCEPT_INVD); 1524 set_intercept(svm, INTERCEPT_INVLPG); 1525 set_intercept(svm, INTERCEPT_INVLPGA); 1526 set_intercept(svm, INTERCEPT_IOIO_PROT); 1527 set_intercept(svm, INTERCEPT_MSR_PROT); 1528 set_intercept(svm, INTERCEPT_TASK_SWITCH); 1529 set_intercept(svm, INTERCEPT_SHUTDOWN); 1530 set_intercept(svm, INTERCEPT_VMRUN); 1531 set_intercept(svm, INTERCEPT_VMMCALL); 1532 set_intercept(svm, INTERCEPT_VMLOAD); 1533 set_intercept(svm, INTERCEPT_VMSAVE); 1534 set_intercept(svm, INTERCEPT_STGI); 1535 set_intercept(svm, INTERCEPT_CLGI); 1536 set_intercept(svm, INTERCEPT_SKINIT); 1537 set_intercept(svm, INTERCEPT_WBINVD); 1538 set_intercept(svm, INTERCEPT_XSETBV); 1539 set_intercept(svm, INTERCEPT_RSM); 1540 1541 if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { 1542 set_intercept(svm, INTERCEPT_MONITOR); 1543 set_intercept(svm, INTERCEPT_MWAIT); 1544 } 1545 1546 if (!kvm_hlt_in_guest(svm->vcpu.kvm)) 1547 set_intercept(svm, INTERCEPT_HLT); 1548 1549 control->iopm_base_pa = __sme_set(iopm_base); 1550 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1551 control->int_ctl = V_INTR_MASKING_MASK; 1552 1553 init_seg(&save->es); 1554 init_seg(&save->ss); 1555 init_seg(&save->ds); 1556 init_seg(&save->fs); 1557 init_seg(&save->gs); 1558 1559 save->cs.selector = 0xf000; 1560 save->cs.base = 0xffff0000; 1561 /* Executable/Readable Code Segment */ 1562 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1563 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1564 save->cs.limit = 0xffff; 1565 1566 save->gdtr.limit = 0xffff; 1567 save->idtr.limit = 0xffff; 1568 1569 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1570 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1571 1572 svm_set_efer(&svm->vcpu, 0); 1573 save->dr6 = 0xffff0ff0; 1574 kvm_set_rflags(&svm->vcpu, 2); 1575 save->rip = 0x0000fff0; 1576 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1577 1578 /* 1579 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 1580 * It also updates the guest-visible cr0 value. 1581 */ 1582 svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); 1583 kvm_mmu_reset_context(&svm->vcpu); 1584 1585 save->cr4 = X86_CR4_PAE; 1586 /* rdx = ?? */ 1587 1588 if (npt_enabled) { 1589 /* Setup VMCB for Nested Paging */ 1590 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 1591 clr_intercept(svm, INTERCEPT_INVLPG); 1592 clr_exception_intercept(svm, PF_VECTOR); 1593 clr_cr_intercept(svm, INTERCEPT_CR3_READ); 1594 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); 1595 save->g_pat = svm->vcpu.arch.pat; 1596 save->cr3 = 0; 1597 save->cr4 = 0; 1598 } 1599 svm->asid_generation = 0; 1600 1601 svm->nested.vmcb = 0; 1602 svm->vcpu.arch.hflags = 0; 1603 1604 if (pause_filter_count) { 1605 control->pause_filter_count = pause_filter_count; 1606 if (pause_filter_thresh) 1607 control->pause_filter_thresh = pause_filter_thresh; 1608 set_intercept(svm, INTERCEPT_PAUSE); 1609 } else { 1610 clr_intercept(svm, INTERCEPT_PAUSE); 1611 } 1612 1613 if (kvm_vcpu_apicv_active(&svm->vcpu)) 1614 avic_init_vmcb(svm); 1615 1616 /* 1617 * If hardware supports Virtual VMLOAD VMSAVE then enable it 1618 * in VMCB and clear intercepts to avoid #VMEXIT. 
1619 */ 1620 if (vls) { 1621 clr_intercept(svm, INTERCEPT_VMLOAD); 1622 clr_intercept(svm, INTERCEPT_VMSAVE); 1623 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1624 } 1625 1626 if (vgif) { 1627 clr_intercept(svm, INTERCEPT_STGI); 1628 clr_intercept(svm, INTERCEPT_CLGI); 1629 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1630 } 1631 1632 if (sev_guest(svm->vcpu.kvm)) { 1633 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; 1634 clr_exception_intercept(svm, UD_VECTOR); 1635 } 1636 1637 mark_all_dirty(svm->vmcb); 1638 1639 enable_gif(svm); 1640 1641 } 1642 1643 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, 1644 unsigned int index) 1645 { 1646 u64 *avic_physical_id_table; 1647 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 1648 1649 if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) 1650 return NULL; 1651 1652 avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); 1653 1654 return &avic_physical_id_table[index]; 1655 } 1656 1657 /** 1658 * Note: 1659 * AVIC hardware walks the nested page table to check permissions, 1660 * but does not use the SPA address specified in the leaf page 1661 * table entry since it uses address in the AVIC_BACKING_PAGE pointer 1662 * field of the VMCB. Therefore, we set up the 1663 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. 1664 */ 1665 static int avic_init_access_page(struct kvm_vcpu *vcpu) 1666 { 1667 struct kvm *kvm = vcpu->kvm; 1668 int ret = 0; 1669 1670 mutex_lock(&kvm->slots_lock); 1671 if (kvm->arch.apic_access_page_done) 1672 goto out; 1673 1674 ret = __x86_set_memory_region(kvm, 1675 APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 1676 APIC_DEFAULT_PHYS_BASE, 1677 PAGE_SIZE); 1678 if (ret) 1679 goto out; 1680 1681 kvm->arch.apic_access_page_done = true; 1682 out: 1683 mutex_unlock(&kvm->slots_lock); 1684 return ret; 1685 } 1686 1687 static int avic_init_backing_page(struct kvm_vcpu *vcpu) 1688 { 1689 int ret; 1690 u64 *entry, new_entry; 1691 int id = vcpu->vcpu_id; 1692 struct vcpu_svm *svm = to_svm(vcpu); 1693 1694 ret = avic_init_access_page(vcpu); 1695 if (ret) 1696 return ret; 1697 1698 if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) 1699 return -EINVAL; 1700 1701 if (!svm->vcpu.arch.apic->regs) 1702 return -EINVAL; 1703 1704 svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); 1705 1706 /* Setting AVIC backing page address in the phy APIC ID table */ 1707 entry = avic_get_physical_id_entry(vcpu, id); 1708 if (!entry) 1709 return -EINVAL; 1710 1711 new_entry = READ_ONCE(*entry); 1712 new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & 1713 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1714 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); 1715 WRITE_ONCE(*entry, new_entry); 1716 1717 svm->avic_physical_id_cache = entry; 1718 1719 return 0; 1720 } 1721 1722 static void __sev_asid_free(int asid) 1723 { 1724 struct svm_cpu_data *sd; 1725 int cpu, pos; 1726 1727 pos = asid - 1; 1728 clear_bit(pos, sev_asid_bitmap); 1729 1730 for_each_possible_cpu(cpu) { 1731 sd = per_cpu(svm_data, cpu); 1732 sd->sev_vmcbs[pos] = NULL; 1733 } 1734 } 1735 1736 static void sev_asid_free(struct kvm *kvm) 1737 { 1738 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 1739 1740 __sev_asid_free(sev->asid); 1741 } 1742 1743 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) 1744 { 1745 struct sev_data_decommission *decommission; 1746 struct sev_data_deactivate *data; 1747 1748 if (!handle) 1749 return; 1750 1751 data = kzalloc(sizeof(*data), GFP_KERNEL); 1752 if (!data) 1753 return; 1754 1755 /* deactivate handle 
 */
	data->handle = handle;
	sev_guest_deactivate(data, NULL);

	wbinvd_on_all_cpus();
	sev_guest_df_flush(NULL);
	kfree(data);

	decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
	if (!decommission)
		return;

	/* decommission handle */
	decommission->handle = handle;
	sev_guest_decommission(decommission, NULL);

	kfree(decommission);
}

static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
				    unsigned long ulen, unsigned long *n,
				    int write)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	unsigned long npages, npinned, size;
	unsigned long locked, lock_limit;
	struct page **pages;
	unsigned long first, last;

	if (ulen == 0 || uaddr + ulen < uaddr)
		return NULL;

	/* Calculate number of pages. */
	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
	npages = (last - first + 1);

	locked = sev->pages_locked + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
		return NULL;
	}

	/* Avoid using vmalloc for smaller buffers. */
	size = npages * sizeof(struct page *);
	if (size > PAGE_SIZE)
		pages = vmalloc(size);
	else
		pages = kmalloc(size, GFP_KERNEL);

	if (!pages)
		return NULL;

	/* Pin the user virtual address. */
	npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
	if (npinned != npages) {
		pr_err("SEV: Failure locking %lu pages.\n", npages);
		goto err;
	}

	*n = npages;
	sev->pages_locked = locked;

	return pages;

err:
	if (npinned > 0)
		release_pages(pages, npinned);

	kvfree(pages);
	return NULL;
}
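/*
 * Worked example of the page-count calculation in sev_pin_memory() above
 * (illustrative only, assuming 4K pages): for uaddr = 0x2800 and
 * ulen = 0x2000 the region touches pages 2, 3 and 4, and the arithmetic
 * yields exactly that:
 *
 *   first  = (0x2800 & PAGE_MASK) >> PAGE_SHIFT                = 2
 *   last   = ((0x2800 + 0x2000 - 1) & PAGE_MASK) >> PAGE_SHIFT = 4
 *   npages = last - first + 1                                  = 3
 */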
1862 */ 1863 sev_clflush_pages(region->pages, region->npages); 1864 1865 sev_unpin_memory(kvm, region->pages, region->npages); 1866 list_del(®ion->list); 1867 kfree(region); 1868 } 1869 1870 static struct kvm *svm_vm_alloc(void) 1871 { 1872 struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); 1873 return &kvm_svm->kvm; 1874 } 1875 1876 static void svm_vm_free(struct kvm *kvm) 1877 { 1878 vfree(to_kvm_svm(kvm)); 1879 } 1880 1881 static void sev_vm_destroy(struct kvm *kvm) 1882 { 1883 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 1884 struct list_head *head = &sev->regions_list; 1885 struct list_head *pos, *q; 1886 1887 if (!sev_guest(kvm)) 1888 return; 1889 1890 mutex_lock(&kvm->lock); 1891 1892 /* 1893 * if userspace was terminated before unregistering the memory regions 1894 * then lets unpin all the registered memory. 1895 */ 1896 if (!list_empty(head)) { 1897 list_for_each_safe(pos, q, head) { 1898 __unregister_enc_region_locked(kvm, 1899 list_entry(pos, struct enc_region, list)); 1900 } 1901 } 1902 1903 mutex_unlock(&kvm->lock); 1904 1905 sev_unbind_asid(kvm, sev->handle); 1906 sev_asid_free(kvm); 1907 } 1908 1909 static void avic_vm_destroy(struct kvm *kvm) 1910 { 1911 unsigned long flags; 1912 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 1913 1914 if (!avic) 1915 return; 1916 1917 if (kvm_svm->avic_logical_id_table_page) 1918 __free_page(kvm_svm->avic_logical_id_table_page); 1919 if (kvm_svm->avic_physical_id_table_page) 1920 __free_page(kvm_svm->avic_physical_id_table_page); 1921 1922 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1923 hash_del(&kvm_svm->hnode); 1924 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1925 } 1926 1927 static void svm_vm_destroy(struct kvm *kvm) 1928 { 1929 avic_vm_destroy(kvm); 1930 sev_vm_destroy(kvm); 1931 } 1932 1933 static int avic_vm_init(struct kvm *kvm) 1934 { 1935 unsigned long flags; 1936 int err = -ENOMEM; 1937 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 1938 struct kvm_svm *k2; 1939 struct page *p_page; 1940 struct page *l_page; 1941 u32 vm_id; 1942 1943 if (!avic) 1944 return 0; 1945 1946 /* Allocating physical APIC ID table (4KB) */ 1947 p_page = alloc_page(GFP_KERNEL); 1948 if (!p_page) 1949 goto free_avic; 1950 1951 kvm_svm->avic_physical_id_table_page = p_page; 1952 clear_page(page_address(p_page)); 1953 1954 /* Allocating logical APIC ID table (4KB) */ 1955 l_page = alloc_page(GFP_KERNEL); 1956 if (!l_page) 1957 goto free_avic; 1958 1959 kvm_svm->avic_logical_id_table_page = l_page; 1960 clear_page(page_address(l_page)); 1961 1962 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1963 again: 1964 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; 1965 if (vm_id == 0) { /* id is 1-based, zero is not okay */ 1966 next_vm_id_wrapped = 1; 1967 goto again; 1968 } 1969 /* Is it still in use? 
Only possible if wrapped at least once */ 1970 if (next_vm_id_wrapped) { 1971 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { 1972 if (k2->avic_vm_id == vm_id) 1973 goto again; 1974 } 1975 } 1976 kvm_svm->avic_vm_id = vm_id; 1977 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); 1978 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1979 1980 return 0; 1981 1982 free_avic: 1983 avic_vm_destroy(kvm); 1984 return err; 1985 } 1986 1987 static inline int 1988 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) 1989 { 1990 int ret = 0; 1991 unsigned long flags; 1992 struct amd_svm_iommu_ir *ir; 1993 struct vcpu_svm *svm = to_svm(vcpu); 1994 1995 if (!kvm_arch_has_assigned_device(vcpu->kvm)) 1996 return 0; 1997 1998 /* 1999 * Here, we go through the per-vcpu ir_list to update all existing 2000 * interrupt remapping table entry targeting this vcpu. 2001 */ 2002 spin_lock_irqsave(&svm->ir_list_lock, flags); 2003 2004 if (list_empty(&svm->ir_list)) 2005 goto out; 2006 2007 list_for_each_entry(ir, &svm->ir_list, node) { 2008 ret = amd_iommu_update_ga(cpu, r, ir->data); 2009 if (ret) 2010 break; 2011 } 2012 out: 2013 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 2014 return ret; 2015 } 2016 2017 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2018 { 2019 u64 entry; 2020 /* ID = 0xff (broadcast), ID > 0xff (reserved) */ 2021 int h_physical_id = kvm_cpu_get_apicid(cpu); 2022 struct vcpu_svm *svm = to_svm(vcpu); 2023 2024 if (!kvm_vcpu_apicv_active(vcpu)) 2025 return; 2026 2027 if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) 2028 return; 2029 2030 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 2031 WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 2032 2033 entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 2034 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); 2035 2036 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 2037 if (svm->avic_is_running) 2038 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 2039 2040 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 2041 avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, 2042 svm->avic_is_running); 2043 } 2044 2045 static void avic_vcpu_put(struct kvm_vcpu *vcpu) 2046 { 2047 u64 entry; 2048 struct vcpu_svm *svm = to_svm(vcpu); 2049 2050 if (!kvm_vcpu_apicv_active(vcpu)) 2051 return; 2052 2053 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 2054 if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) 2055 avic_update_iommu_vcpu_affinity(vcpu, -1, 0); 2056 2057 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 2058 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 2059 } 2060 2061 /** 2062 * This function is called during VCPU halt/unhalt. 
2063 */ 2064 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) 2065 { 2066 struct vcpu_svm *svm = to_svm(vcpu); 2067 2068 svm->avic_is_running = is_run; 2069 if (is_run) 2070 avic_vcpu_load(vcpu, vcpu->cpu); 2071 else 2072 avic_vcpu_put(vcpu); 2073 } 2074 2075 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 2076 { 2077 struct vcpu_svm *svm = to_svm(vcpu); 2078 u32 dummy; 2079 u32 eax = 1; 2080 2081 vcpu->arch.microcode_version = 0x01000065; 2082 svm->spec_ctrl = 0; 2083 svm->virt_spec_ctrl = 0; 2084 2085 if (!init_event) { 2086 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | 2087 MSR_IA32_APICBASE_ENABLE; 2088 if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) 2089 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 2090 } 2091 init_vmcb(svm); 2092 2093 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); 2094 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 2095 2096 if (kvm_vcpu_apicv_active(vcpu) && !init_event) 2097 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); 2098 } 2099 2100 static int avic_init_vcpu(struct vcpu_svm *svm) 2101 { 2102 int ret; 2103 2104 if (!kvm_vcpu_apicv_active(&svm->vcpu)) 2105 return 0; 2106 2107 ret = avic_init_backing_page(&svm->vcpu); 2108 if (ret) 2109 return ret; 2110 2111 INIT_LIST_HEAD(&svm->ir_list); 2112 spin_lock_init(&svm->ir_list_lock); 2113 2114 return ret; 2115 } 2116 2117 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 2118 { 2119 struct vcpu_svm *svm; 2120 struct page *page; 2121 struct page *msrpm_pages; 2122 struct page *hsave_page; 2123 struct page *nested_msrpm_pages; 2124 int err; 2125 2126 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 2127 if (!svm) { 2128 err = -ENOMEM; 2129 goto out; 2130 } 2131 2132 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 2133 if (err) 2134 goto free_svm; 2135 2136 err = -ENOMEM; 2137 page = alloc_page(GFP_KERNEL); 2138 if (!page) 2139 goto uninit; 2140 2141 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 2142 if (!msrpm_pages) 2143 goto free_page1; 2144 2145 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 2146 if (!nested_msrpm_pages) 2147 goto free_page2; 2148 2149 hsave_page = alloc_page(GFP_KERNEL); 2150 if (!hsave_page) 2151 goto free_page3; 2152 2153 err = avic_init_vcpu(svm); 2154 if (err) 2155 goto free_page4; 2156 2157 /* We initialize this flag to true to make sure that the is_running 2158 * bit would be set the first time the vcpu is loaded. 
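 * (avic_vcpu_load() only sets IS_RUNNING in the physical APIC ID table
 * entry when svm->avic_is_running is true, hence the default of true here.)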
2159 */ 2160 svm->avic_is_running = true; 2161 2162 svm->nested.hsave = page_address(hsave_page); 2163 2164 svm->msrpm = page_address(msrpm_pages); 2165 svm_vcpu_init_msrpm(svm->msrpm); 2166 2167 svm->nested.msrpm = page_address(nested_msrpm_pages); 2168 svm_vcpu_init_msrpm(svm->nested.msrpm); 2169 2170 svm->vmcb = page_address(page); 2171 clear_page(svm->vmcb); 2172 svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); 2173 svm->asid_generation = 0; 2174 init_vmcb(svm); 2175 2176 svm_init_osvw(&svm->vcpu); 2177 2178 return &svm->vcpu; 2179 2180 free_page4: 2181 __free_page(hsave_page); 2182 free_page3: 2183 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 2184 free_page2: 2185 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); 2186 free_page1: 2187 __free_page(page); 2188 uninit: 2189 kvm_vcpu_uninit(&svm->vcpu); 2190 free_svm: 2191 kmem_cache_free(kvm_vcpu_cache, svm); 2192 out: 2193 return ERR_PTR(err); 2194 } 2195 2196 static void svm_clear_current_vmcb(struct vmcb *vmcb) 2197 { 2198 int i; 2199 2200 for_each_online_cpu(i) 2201 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); 2202 } 2203 2204 static void svm_free_vcpu(struct kvm_vcpu *vcpu) 2205 { 2206 struct vcpu_svm *svm = to_svm(vcpu); 2207 2208 /* 2209 * The vmcb page can be recycled, causing a false negative in 2210 * svm_vcpu_load(). So, ensure that no logical CPU has this 2211 * vmcb page recorded as its current vmcb. 2212 */ 2213 svm_clear_current_vmcb(svm->vmcb); 2214 2215 __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); 2216 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 2217 __free_page(virt_to_page(svm->nested.hsave)); 2218 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); 2219 kvm_vcpu_uninit(vcpu); 2220 kmem_cache_free(kvm_vcpu_cache, svm); 2221 } 2222 2223 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2224 { 2225 struct vcpu_svm *svm = to_svm(vcpu); 2226 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 2227 int i; 2228 2229 if (unlikely(cpu != vcpu->cpu)) { 2230 svm->asid_generation = 0; 2231 mark_all_dirty(svm->vmcb); 2232 } 2233 2234 #ifdef CONFIG_X86_64 2235 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); 2236 #endif 2237 savesegment(fs, svm->host.fs); 2238 savesegment(gs, svm->host.gs); 2239 svm->host.ldt = kvm_read_ldt(); 2240 2241 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 2242 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 2243 2244 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 2245 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; 2246 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { 2247 __this_cpu_write(current_tsc_ratio, tsc_ratio); 2248 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); 2249 } 2250 } 2251 /* This assumes that the kernel never uses MSR_TSC_AUX */ 2252 if (static_cpu_has(X86_FEATURE_RDTSCP)) 2253 wrmsrl(MSR_TSC_AUX, svm->tsc_aux); 2254 2255 if (sd->current_vmcb != svm->vmcb) { 2256 sd->current_vmcb = svm->vmcb; 2257 indirect_branch_prediction_barrier(); 2258 } 2259 avic_vcpu_load(vcpu, cpu); 2260 } 2261 2262 static void svm_vcpu_put(struct kvm_vcpu *vcpu) 2263 { 2264 struct vcpu_svm *svm = to_svm(vcpu); 2265 int i; 2266 2267 avic_vcpu_put(vcpu); 2268 2269 ++vcpu->stat.host_state_reload; 2270 kvm_load_ldt(svm->host.ldt); 2271 #ifdef CONFIG_X86_64 2272 loadsegment(fs, svm->host.fs); 2273 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); 2274 load_gs_index(svm->host.gs); 2275 #else 2276 #ifdef CONFIG_X86_32_LAZY_GS 2277 loadsegment(gs, svm->host.gs); 2278 #endif 2279 #endif 2280 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 
2281 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 2282 } 2283 2284 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) 2285 { 2286 avic_set_running(vcpu, false); 2287 } 2288 2289 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) 2290 { 2291 avic_set_running(vcpu, true); 2292 } 2293 2294 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 2295 { 2296 struct vcpu_svm *svm = to_svm(vcpu); 2297 unsigned long rflags = svm->vmcb->save.rflags; 2298 2299 if (svm->nmi_singlestep) { 2300 /* Hide our flags if they were not set by the guest */ 2301 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 2302 rflags &= ~X86_EFLAGS_TF; 2303 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 2304 rflags &= ~X86_EFLAGS_RF; 2305 } 2306 return rflags; 2307 } 2308 2309 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 2310 { 2311 if (to_svm(vcpu)->nmi_singlestep) 2312 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2313 2314 /* 2315 * Any change of EFLAGS.VM is accompanied by a reload of SS 2316 * (caused by either a task switch or an inter-privilege IRET), 2317 * so we do not need to update the CPL here. 2318 */ 2319 to_svm(vcpu)->vmcb->save.rflags = rflags; 2320 } 2321 2322 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2323 { 2324 switch (reg) { 2325 case VCPU_EXREG_PDPTR: 2326 BUG_ON(!npt_enabled); 2327 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 2328 break; 2329 default: 2330 BUG(); 2331 } 2332 } 2333 2334 static void svm_set_vintr(struct vcpu_svm *svm) 2335 { 2336 set_intercept(svm, INTERCEPT_VINTR); 2337 } 2338 2339 static void svm_clear_vintr(struct vcpu_svm *svm) 2340 { 2341 clr_intercept(svm, INTERCEPT_VINTR); 2342 } 2343 2344 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 2345 { 2346 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 2347 2348 switch (seg) { 2349 case VCPU_SREG_CS: return &save->cs; 2350 case VCPU_SREG_DS: return &save->ds; 2351 case VCPU_SREG_ES: return &save->es; 2352 case VCPU_SREG_FS: return &save->fs; 2353 case VCPU_SREG_GS: return &save->gs; 2354 case VCPU_SREG_SS: return &save->ss; 2355 case VCPU_SREG_TR: return &save->tr; 2356 case VCPU_SREG_LDTR: return &save->ldtr; 2357 } 2358 BUG(); 2359 return NULL; 2360 } 2361 2362 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) 2363 { 2364 struct vmcb_seg *s = svm_seg(vcpu, seg); 2365 2366 return s->base; 2367 } 2368 2369 static void svm_get_segment(struct kvm_vcpu *vcpu, 2370 struct kvm_segment *var, int seg) 2371 { 2372 struct vmcb_seg *s = svm_seg(vcpu, seg); 2373 2374 var->base = s->base; 2375 var->limit = s->limit; 2376 var->selector = s->selector; 2377 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; 2378 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; 2379 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 2380 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; 2381 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; 2382 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 2383 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 2384 2385 /* 2386 * AMD CPUs circa 2014 track the G bit for all segments except CS. 2387 * However, the SVM spec states that the G bit is not observed by the 2388 * CPU, and some VMware virtual CPUs drop the G bit for all segments. 2389 * So let's synthesize a legal G bit for all segments, this helps 2390 * running KVM nested. It also helps cross-vendor migration, because 2391 * Intel's vmentry has a check on the 'G' bit. 
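 * The descriptor limit field is only 20 bits wide, so a cached limit above
 * 0xfffff can only have come from a page-granular (G=1) descriptor; deriving
 * G from the cached limit below is therefore safe.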
2392 */ 2393 var->g = s->limit > 0xfffff; 2394 2395 /* 2396 * AMD's VMCB does not have an explicit unusable field, so emulate it 2397 * for cross vendor migration purposes by "not present" 2398 */ 2399 var->unusable = !var->present; 2400 2401 switch (seg) { 2402 case VCPU_SREG_TR: 2403 /* 2404 * Work around a bug where the busy flag in the tr selector 2405 * isn't exposed 2406 */ 2407 var->type |= 0x2; 2408 break; 2409 case VCPU_SREG_DS: 2410 case VCPU_SREG_ES: 2411 case VCPU_SREG_FS: 2412 case VCPU_SREG_GS: 2413 /* 2414 * The accessed bit must always be set in the segment 2415 * descriptor cache, although it can be cleared in the 2416 * descriptor, the cached bit always remains at 1. Since 2417 * Intel has a check on this, set it here to support 2418 * cross-vendor migration. 2419 */ 2420 if (!var->unusable) 2421 var->type |= 0x1; 2422 break; 2423 case VCPU_SREG_SS: 2424 /* 2425 * On AMD CPUs sometimes the DB bit in the segment 2426 * descriptor is left as 1, although the whole segment has 2427 * been made unusable. Clear it here to pass an Intel VMX 2428 * entry check when cross vendor migrating. 2429 */ 2430 if (var->unusable) 2431 var->db = 0; 2432 /* This is symmetric with svm_set_segment() */ 2433 var->dpl = to_svm(vcpu)->vmcb->save.cpl; 2434 break; 2435 } 2436 } 2437 2438 static int svm_get_cpl(struct kvm_vcpu *vcpu) 2439 { 2440 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 2441 2442 return save->cpl; 2443 } 2444 2445 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2446 { 2447 struct vcpu_svm *svm = to_svm(vcpu); 2448 2449 dt->size = svm->vmcb->save.idtr.limit; 2450 dt->address = svm->vmcb->save.idtr.base; 2451 } 2452 2453 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2454 { 2455 struct vcpu_svm *svm = to_svm(vcpu); 2456 2457 svm->vmcb->save.idtr.limit = dt->size; 2458 svm->vmcb->save.idtr.base = dt->address ; 2459 mark_dirty(svm->vmcb, VMCB_DT); 2460 } 2461 2462 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2463 { 2464 struct vcpu_svm *svm = to_svm(vcpu); 2465 2466 dt->size = svm->vmcb->save.gdtr.limit; 2467 dt->address = svm->vmcb->save.gdtr.base; 2468 } 2469 2470 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2471 { 2472 struct vcpu_svm *svm = to_svm(vcpu); 2473 2474 svm->vmcb->save.gdtr.limit = dt->size; 2475 svm->vmcb->save.gdtr.base = dt->address ; 2476 mark_dirty(svm->vmcb, VMCB_DT); 2477 } 2478 2479 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 2480 { 2481 } 2482 2483 static void svm_decache_cr3(struct kvm_vcpu *vcpu) 2484 { 2485 } 2486 2487 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2488 { 2489 } 2490 2491 static void update_cr0_intercept(struct vcpu_svm *svm) 2492 { 2493 ulong gcr0 = svm->vcpu.arch.cr0; 2494 u64 *hcr0 = &svm->vmcb->save.cr0; 2495 2496 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 2497 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 2498 2499 mark_dirty(svm->vmcb, VMCB_CR); 2500 2501 if (gcr0 == *hcr0) { 2502 clr_cr_intercept(svm, INTERCEPT_CR0_READ); 2503 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); 2504 } else { 2505 set_cr_intercept(svm, INTERCEPT_CR0_READ); 2506 set_cr_intercept(svm, INTERCEPT_CR0_WRITE); 2507 } 2508 } 2509 2510 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 2511 { 2512 struct vcpu_svm *svm = to_svm(vcpu); 2513 2514 #ifdef CONFIG_X86_64 2515 if (vcpu->arch.efer & EFER_LME) { 2516 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 2517 vcpu->arch.efer |= EFER_LMA; 2518 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 
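/* (CR0.PG being turned on while EFER.LME is already set is what
 * activates long mode, hence EFER.LMA is mirrored above into both
 * the architectural EFER and the VMCB copy.) */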
2519 } 2520 2521 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 2522 vcpu->arch.efer &= ~EFER_LMA; 2523 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 2524 } 2525 } 2526 #endif 2527 vcpu->arch.cr0 = cr0; 2528 2529 if (!npt_enabled) 2530 cr0 |= X86_CR0_PG | X86_CR0_WP; 2531 2532 /* 2533 * re-enable caching here because the QEMU bios 2534 * does not do it - this results in some delay at 2535 * reboot 2536 */ 2537 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 2538 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 2539 svm->vmcb->save.cr0 = cr0; 2540 mark_dirty(svm->vmcb, VMCB_CR); 2541 update_cr0_intercept(svm); 2542 } 2543 2544 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2545 { 2546 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; 2547 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 2548 2549 if (cr4 & X86_CR4_VMXE) 2550 return 1; 2551 2552 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 2553 svm_flush_tlb(vcpu, true); 2554 2555 vcpu->arch.cr4 = cr4; 2556 if (!npt_enabled) 2557 cr4 |= X86_CR4_PAE; 2558 cr4 |= host_cr4_mce; 2559 to_svm(vcpu)->vmcb->save.cr4 = cr4; 2560 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 2561 return 0; 2562 } 2563 2564 static void svm_set_segment(struct kvm_vcpu *vcpu, 2565 struct kvm_segment *var, int seg) 2566 { 2567 struct vcpu_svm *svm = to_svm(vcpu); 2568 struct vmcb_seg *s = svm_seg(vcpu, seg); 2569 2570 s->base = var->base; 2571 s->limit = var->limit; 2572 s->selector = var->selector; 2573 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); 2574 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; 2575 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; 2576 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; 2577 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; 2578 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; 2579 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 2580 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 2581 2582 /* 2583 * This is always accurate, except if SYSRET returned to a segment 2584 * with SS.DPL != 3. Intel does not have this quirk, and always 2585 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 2586 * would entail passing the CPL to userspace and back. 
2587 */ 2588 if (seg == VCPU_SREG_SS) 2589 /* This is symmetric with svm_get_segment() */ 2590 svm->vmcb->save.cpl = (var->dpl & 3); 2591 2592 mark_dirty(svm->vmcb, VMCB_SEG); 2593 } 2594 2595 static void update_bp_intercept(struct kvm_vcpu *vcpu) 2596 { 2597 struct vcpu_svm *svm = to_svm(vcpu); 2598 2599 clr_exception_intercept(svm, BP_VECTOR); 2600 2601 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 2602 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2603 set_exception_intercept(svm, BP_VECTOR); 2604 } else 2605 vcpu->guest_debug = 0; 2606 } 2607 2608 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 2609 { 2610 if (sd->next_asid > sd->max_asid) { 2611 ++sd->asid_generation; 2612 sd->next_asid = sd->min_asid; 2613 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 2614 } 2615 2616 svm->asid_generation = sd->asid_generation; 2617 svm->vmcb->control.asid = sd->next_asid++; 2618 2619 mark_dirty(svm->vmcb, VMCB_ASID); 2620 } 2621 2622 static u64 svm_get_dr6(struct kvm_vcpu *vcpu) 2623 { 2624 return to_svm(vcpu)->vmcb->save.dr6; 2625 } 2626 2627 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) 2628 { 2629 struct vcpu_svm *svm = to_svm(vcpu); 2630 2631 svm->vmcb->save.dr6 = value; 2632 mark_dirty(svm->vmcb, VMCB_DR); 2633 } 2634 2635 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 2636 { 2637 struct vcpu_svm *svm = to_svm(vcpu); 2638 2639 get_debugreg(vcpu->arch.db[0], 0); 2640 get_debugreg(vcpu->arch.db[1], 1); 2641 get_debugreg(vcpu->arch.db[2], 2); 2642 get_debugreg(vcpu->arch.db[3], 3); 2643 vcpu->arch.dr6 = svm_get_dr6(vcpu); 2644 vcpu->arch.dr7 = svm->vmcb->save.dr7; 2645 2646 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 2647 set_dr_intercepts(svm); 2648 } 2649 2650 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 2651 { 2652 struct vcpu_svm *svm = to_svm(vcpu); 2653 2654 svm->vmcb->save.dr7 = value; 2655 mark_dirty(svm->vmcb, VMCB_DR); 2656 } 2657 2658 static int pf_interception(struct vcpu_svm *svm) 2659 { 2660 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); 2661 u64 error_code = svm->vmcb->control.exit_info_1; 2662 2663 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, 2664 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 2665 svm->vmcb->control.insn_bytes : NULL, 2666 svm->vmcb->control.insn_len); 2667 } 2668 2669 static int npf_interception(struct vcpu_svm *svm) 2670 { 2671 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); 2672 u64 error_code = svm->vmcb->control.exit_info_1; 2673 2674 trace_kvm_page_fault(fault_address, error_code); 2675 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, 2676 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 
2677 svm->vmcb->control.insn_bytes : NULL, 2678 svm->vmcb->control.insn_len); 2679 } 2680 2681 static int db_interception(struct vcpu_svm *svm) 2682 { 2683 struct kvm_run *kvm_run = svm->vcpu.run; 2684 2685 if (!(svm->vcpu.guest_debug & 2686 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 2687 !svm->nmi_singlestep) { 2688 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 2689 return 1; 2690 } 2691 2692 if (svm->nmi_singlestep) { 2693 disable_nmi_singlestep(svm); 2694 } 2695 2696 if (svm->vcpu.guest_debug & 2697 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 2698 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2699 kvm_run->debug.arch.pc = 2700 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2701 kvm_run->debug.arch.exception = DB_VECTOR; 2702 return 0; 2703 } 2704 2705 return 1; 2706 } 2707 2708 static int bp_interception(struct vcpu_svm *svm) 2709 { 2710 struct kvm_run *kvm_run = svm->vcpu.run; 2711 2712 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2713 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2714 kvm_run->debug.arch.exception = BP_VECTOR; 2715 return 0; 2716 } 2717 2718 static int ud_interception(struct vcpu_svm *svm) 2719 { 2720 return handle_ud(&svm->vcpu); 2721 } 2722 2723 static int ac_interception(struct vcpu_svm *svm) 2724 { 2725 kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); 2726 return 1; 2727 } 2728 2729 static int gp_interception(struct vcpu_svm *svm) 2730 { 2731 struct kvm_vcpu *vcpu = &svm->vcpu; 2732 u32 error_code = svm->vmcb->control.exit_info_1; 2733 int er; 2734 2735 WARN_ON_ONCE(!enable_vmware_backdoor); 2736 2737 er = kvm_emulate_instruction(vcpu, 2738 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); 2739 if (er == EMULATE_USER_EXIT) 2740 return 0; 2741 else if (er != EMULATE_DONE) 2742 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2743 return 1; 2744 } 2745 2746 static bool is_erratum_383(void) 2747 { 2748 int err, i; 2749 u64 value; 2750 2751 if (!erratum_383_found) 2752 return false; 2753 2754 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); 2755 if (err) 2756 return false; 2757 2758 /* Bit 62 may or may not be set for this mce */ 2759 value &= ~(1ULL << 62); 2760 2761 if (value != 0xb600000000010015ULL) 2762 return false; 2763 2764 /* Clear MCi_STATUS registers */ 2765 for (i = 0; i < 6; ++i) 2766 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); 2767 2768 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); 2769 if (!err) { 2770 u32 low, high; 2771 2772 value &= ~(1ULL << 2); 2773 low = lower_32_bits(value); 2774 high = upper_32_bits(value); 2775 2776 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); 2777 } 2778 2779 /* Flush tlb to evict multi-match entries */ 2780 __flush_tlb_all(); 2781 2782 return true; 2783 } 2784 2785 static void svm_handle_mce(struct vcpu_svm *svm) 2786 { 2787 if (is_erratum_383()) { 2788 /* 2789 * Erratum 383 triggered. Guest state is corrupt so kill the 2790 * guest. 2791 */ 2792 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 2793 2794 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); 2795 2796 return; 2797 } 2798 2799 /* 2800 * On an #MC intercept the MCE handler is not called automatically in 2801 * the host. So do it by hand here. 
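 * Vector 0x12 (18) is the Machine Check exception, so the "int $0x12" below
 * simply re-raises the #MC so that the host's normal handler can process it.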
2802 */ 2803 asm volatile ( 2804 "int $0x12\n"); 2805 /* not sure if we ever come back to this point */ 2806 2807 return; 2808 } 2809 2810 static int mc_interception(struct vcpu_svm *svm) 2811 { 2812 return 1; 2813 } 2814 2815 static int shutdown_interception(struct vcpu_svm *svm) 2816 { 2817 struct kvm_run *kvm_run = svm->vcpu.run; 2818 2819 /* 2820 * VMCB is undefined after a SHUTDOWN intercept 2821 * so reinitialize it. 2822 */ 2823 clear_page(svm->vmcb); 2824 init_vmcb(svm); 2825 2826 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2827 return 0; 2828 } 2829 2830 static int io_interception(struct vcpu_svm *svm) 2831 { 2832 struct kvm_vcpu *vcpu = &svm->vcpu; 2833 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2834 int size, in, string; 2835 unsigned port; 2836 2837 ++svm->vcpu.stat.io_exits; 2838 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2839 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2840 if (string) 2841 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 2842 2843 port = io_info >> 16; 2844 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2845 svm->next_rip = svm->vmcb->control.exit_info_2; 2846 2847 return kvm_fast_pio(&svm->vcpu, size, port, in); 2848 } 2849 2850 static int nmi_interception(struct vcpu_svm *svm) 2851 { 2852 return 1; 2853 } 2854 2855 static int intr_interception(struct vcpu_svm *svm) 2856 { 2857 ++svm->vcpu.stat.irq_exits; 2858 return 1; 2859 } 2860 2861 static int nop_on_interception(struct vcpu_svm *svm) 2862 { 2863 return 1; 2864 } 2865 2866 static int halt_interception(struct vcpu_svm *svm) 2867 { 2868 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 2869 return kvm_emulate_halt(&svm->vcpu); 2870 } 2871 2872 static int vmmcall_interception(struct vcpu_svm *svm) 2873 { 2874 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2875 return kvm_emulate_hypercall(&svm->vcpu); 2876 } 2877 2878 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) 2879 { 2880 struct vcpu_svm *svm = to_svm(vcpu); 2881 2882 return svm->nested.nested_cr3; 2883 } 2884 2885 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) 2886 { 2887 struct vcpu_svm *svm = to_svm(vcpu); 2888 u64 cr3 = svm->nested.nested_cr3; 2889 u64 pdpte; 2890 int ret; 2891 2892 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, 2893 offset_in_page(cr3) + index * 8, 8); 2894 if (ret) 2895 return 0; 2896 return pdpte; 2897 } 2898 2899 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, 2900 unsigned long root) 2901 { 2902 struct vcpu_svm *svm = to_svm(vcpu); 2903 2904 svm->vmcb->control.nested_cr3 = __sme_set(root); 2905 mark_dirty(svm->vmcb, VMCB_NPT); 2906 } 2907 2908 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, 2909 struct x86_exception *fault) 2910 { 2911 struct vcpu_svm *svm = to_svm(vcpu); 2912 2913 if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { 2914 /* 2915 * TODO: track the cause of the nested page fault, and 2916 * correctly fill in the high bits of exit_info_1. 2917 */ 2918 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 2919 svm->vmcb->control.exit_code_hi = 0; 2920 svm->vmcb->control.exit_info_1 = (1ULL << 32); 2921 svm->vmcb->control.exit_info_2 = fault->address; 2922 } 2923 2924 svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; 2925 svm->vmcb->control.exit_info_1 |= fault->error_code; 2926 2927 /* 2928 * The present bit is always zero for page structure faults on real 2929 * hardware. 
2930 */ 2931 if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) 2932 svm->vmcb->control.exit_info_1 &= ~1; 2933 2934 nested_svm_vmexit(svm); 2935 } 2936 2937 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 2938 { 2939 WARN_ON(mmu_is_nested(vcpu)); 2940 2941 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 2942 kvm_init_shadow_mmu(vcpu); 2943 vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; 2944 vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; 2945 vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; 2946 vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; 2947 vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); 2948 reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); 2949 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 2950 } 2951 2952 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) 2953 { 2954 vcpu->arch.mmu = &vcpu->arch.root_mmu; 2955 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 2956 } 2957 2958 static int nested_svm_check_permissions(struct vcpu_svm *svm) 2959 { 2960 if (!(svm->vcpu.arch.efer & EFER_SVME) || 2961 !is_paging(&svm->vcpu)) { 2962 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2963 return 1; 2964 } 2965 2966 if (svm->vmcb->save.cpl) { 2967 kvm_inject_gp(&svm->vcpu, 0); 2968 return 1; 2969 } 2970 2971 return 0; 2972 } 2973 2974 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 2975 bool has_error_code, u32 error_code) 2976 { 2977 int vmexit; 2978 2979 if (!is_guest_mode(&svm->vcpu)) 2980 return 0; 2981 2982 vmexit = nested_svm_intercept(svm); 2983 if (vmexit != NESTED_EXIT_DONE) 2984 return 0; 2985 2986 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 2987 svm->vmcb->control.exit_code_hi = 0; 2988 svm->vmcb->control.exit_info_1 = error_code; 2989 2990 /* 2991 * EXITINFO2 is undefined for all exception intercepts other 2992 * than #PF. 2993 */ 2994 if (svm->vcpu.arch.exception.nested_apf) 2995 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; 2996 else if (svm->vcpu.arch.exception.has_payload) 2997 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; 2998 else 2999 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 3000 3001 svm->nested.exit_required = true; 3002 return vmexit; 3003 } 3004 3005 /* This function returns true if it is safe to enable the irq window */ 3006 static inline bool nested_svm_intr(struct vcpu_svm *svm) 3007 { 3008 if (!is_guest_mode(&svm->vcpu)) 3009 return true; 3010 3011 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 3012 return true; 3013 3014 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 3015 return false; 3016 3017 /* 3018 * If a vmexit was already requested (by an intercepted exception, 3019 * for instance) do not overwrite it with an "external interrupt" 3020 * vmexit. 3021 */ 3022 if (svm->nested.exit_required) 3023 return false; 3024 3025 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 3026 svm->vmcb->control.exit_info_1 = 0; 3027 svm->vmcb->control.exit_info_2 = 0; 3028 3029 if (svm->nested.intercept & 1ULL) { 3030 /* 3031 * The #vmexit can't be emulated here directly because this 3032 * code path runs with irqs and preemption disabled. A 3033 * #vmexit emulation might sleep. Only signal a request for 3034 * the #vmexit here.
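 * The deferred #vmexit is then emulated from the regular exit handling
 * path, where sleeping is allowed, once nested.exit_required is observed.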
3035 */ 3036 svm->nested.exit_required = true; 3037 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 3038 return false; 3039 } 3040 3041 return true; 3042 } 3043 3044 /* This function returns true if it is safe to enable the nmi window */ 3045 static inline bool nested_svm_nmi(struct vcpu_svm *svm) 3046 { 3047 if (!is_guest_mode(&svm->vcpu)) 3048 return true; 3049 3050 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 3051 return true; 3052 3053 svm->vmcb->control.exit_code = SVM_EXIT_NMI; 3054 svm->nested.exit_required = true; 3055 3056 return false; 3057 } 3058 3059 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) 3060 { 3061 struct page *page; 3062 3063 might_sleep(); 3064 3065 page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); 3066 if (is_error_page(page)) 3067 goto error; 3068 3069 *_page = page; 3070 3071 return kmap(page); 3072 3073 error: 3074 kvm_inject_gp(&svm->vcpu, 0); 3075 3076 return NULL; 3077 } 3078 3079 static void nested_svm_unmap(struct page *page) 3080 { 3081 kunmap(page); 3082 kvm_release_page_dirty(page); 3083 } 3084 3085 static int nested_svm_intercept_ioio(struct vcpu_svm *svm) 3086 { 3087 unsigned port, size, iopm_len; 3088 u16 val, mask; 3089 u8 start_bit; 3090 u64 gpa; 3091 3092 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) 3093 return NESTED_EXIT_HOST; 3094 3095 port = svm->vmcb->control.exit_info_1 >> 16; 3096 size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> 3097 SVM_IOIO_SIZE_SHIFT; 3098 gpa = svm->nested.vmcb_iopm + (port / 8); 3099 start_bit = port % 8; 3100 iopm_len = (start_bit + size > 8) ? 2 : 1; 3101 mask = (0xf >> (4 - size)) << start_bit; 3102 val = 0; 3103 3104 if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) 3105 return NESTED_EXIT_DONE; 3106 3107 return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 3108 } 3109 3110 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) 3111 { 3112 u32 offset, msr, value; 3113 int write, mask; 3114 3115 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 3116 return NESTED_EXIT_HOST; 3117 3118 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3119 offset = svm_msrpm_offset(msr); 3120 write = svm->vmcb->control.exit_info_1 & 1; 3121 mask = 1 << ((2 * (msr & 0xf)) + write); 3122 3123 if (offset == MSR_INVALID) 3124 return NESTED_EXIT_DONE; 3125 3126 /* Offset is in 32 bit units but we need it in 8 bit units */ 3127 offset *= 4; 3128 3129 if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) 3130 return NESTED_EXIT_DONE; 3131 3132 return (value & mask) ?
NESTED_EXIT_DONE : NESTED_EXIT_HOST; 3133 } 3134 3135 /* DB exceptions for our internal use must not cause vmexit */ 3136 static int nested_svm_intercept_db(struct vcpu_svm *svm) 3137 { 3138 unsigned long dr6; 3139 3140 /* if we're not singlestepping, it's not ours */ 3141 if (!svm->nmi_singlestep) 3142 return NESTED_EXIT_DONE; 3143 3144 /* if it's not a singlestep exception, it's not ours */ 3145 if (kvm_get_dr(&svm->vcpu, 6, &dr6)) 3146 return NESTED_EXIT_DONE; 3147 if (!(dr6 & DR6_BS)) 3148 return NESTED_EXIT_DONE; 3149 3150 /* if the guest is singlestepping, it should get the vmexit */ 3151 if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { 3152 disable_nmi_singlestep(svm); 3153 return NESTED_EXIT_DONE; 3154 } 3155 3156 /* it's ours, the nested hypervisor must not see this one */ 3157 return NESTED_EXIT_HOST; 3158 } 3159 3160 static int nested_svm_exit_special(struct vcpu_svm *svm) 3161 { 3162 u32 exit_code = svm->vmcb->control.exit_code; 3163 3164 switch (exit_code) { 3165 case SVM_EXIT_INTR: 3166 case SVM_EXIT_NMI: 3167 case SVM_EXIT_EXCP_BASE + MC_VECTOR: 3168 return NESTED_EXIT_HOST; 3169 case SVM_EXIT_NPF: 3170 /* For now we are always handling NPFs when using them */ 3171 if (npt_enabled) 3172 return NESTED_EXIT_HOST; 3173 break; 3174 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 3175 /* When we're shadowing, trap PFs, but not async PF */ 3176 if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) 3177 return NESTED_EXIT_HOST; 3178 break; 3179 default: 3180 break; 3181 } 3182 3183 return NESTED_EXIT_CONTINUE; 3184 } 3185 3186 /* 3187 * If this function returns true, this #vmexit was already handled 3188 */ 3189 static int nested_svm_intercept(struct vcpu_svm *svm) 3190 { 3191 u32 exit_code = svm->vmcb->control.exit_code; 3192 int vmexit = NESTED_EXIT_HOST; 3193 3194 switch (exit_code) { 3195 case SVM_EXIT_MSR: 3196 vmexit = nested_svm_exit_handled_msr(svm); 3197 break; 3198 case SVM_EXIT_IOIO: 3199 vmexit = nested_svm_intercept_ioio(svm); 3200 break; 3201 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { 3202 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); 3203 if (svm->nested.intercept_cr & bit) 3204 vmexit = NESTED_EXIT_DONE; 3205 break; 3206 } 3207 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { 3208 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); 3209 if (svm->nested.intercept_dr & bit) 3210 vmexit = NESTED_EXIT_DONE; 3211 break; 3212 } 3213 case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { 3214 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 3215 if (svm->nested.intercept_exceptions & excp_bits) { 3216 if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) 3217 vmexit = nested_svm_intercept_db(svm); 3218 else 3219 vmexit = NESTED_EXIT_DONE; 3220 } 3221 /* async page fault always cause vmexit */ 3222 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && 3223 svm->vcpu.arch.exception.nested_apf != 0) 3224 vmexit = NESTED_EXIT_DONE; 3225 break; 3226 } 3227 case SVM_EXIT_ERR: { 3228 vmexit = NESTED_EXIT_DONE; 3229 break; 3230 } 3231 default: { 3232 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 3233 if (svm->nested.intercept & exit_bits) 3234 vmexit = NESTED_EXIT_DONE; 3235 } 3236 } 3237 3238 return vmexit; 3239 } 3240 3241 static int nested_svm_exit_handled(struct vcpu_svm *svm) 3242 { 3243 int vmexit; 3244 3245 vmexit = nested_svm_intercept(svm); 3246 3247 if (vmexit == NESTED_EXIT_DONE) 3248 nested_svm_vmexit(svm); 3249 3250 return vmexit; 3251 } 3252 3253 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) 3254 { 3255 struct vmcb_control_area *dst = &dst_vmcb->control; 3256 struct vmcb_control_area *from = &from_vmcb->control; 3257 3258 dst->intercept_cr = from->intercept_cr; 3259 dst->intercept_dr = from->intercept_dr; 3260 dst->intercept_exceptions = from->intercept_exceptions; 3261 dst->intercept = from->intercept; 3262 dst->iopm_base_pa = from->iopm_base_pa; 3263 dst->msrpm_base_pa = from->msrpm_base_pa; 3264 dst->tsc_offset = from->tsc_offset; 3265 dst->asid = from->asid; 3266 dst->tlb_ctl = from->tlb_ctl; 3267 dst->int_ctl = from->int_ctl; 3268 dst->int_vector = from->int_vector; 3269 dst->int_state = from->int_state; 3270 dst->exit_code = from->exit_code; 3271 dst->exit_code_hi = from->exit_code_hi; 3272 dst->exit_info_1 = from->exit_info_1; 3273 dst->exit_info_2 = from->exit_info_2; 3274 dst->exit_int_info = from->exit_int_info; 3275 dst->exit_int_info_err = from->exit_int_info_err; 3276 dst->nested_ctl = from->nested_ctl; 3277 dst->event_inj = from->event_inj; 3278 dst->event_inj_err = from->event_inj_err; 3279 dst->nested_cr3 = from->nested_cr3; 3280 dst->virt_ext = from->virt_ext; 3281 } 3282 3283 static int nested_svm_vmexit(struct vcpu_svm *svm) 3284 { 3285 struct vmcb *nested_vmcb; 3286 struct vmcb *hsave = svm->nested.hsave; 3287 struct vmcb *vmcb = svm->vmcb; 3288 struct page *page; 3289 3290 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 3291 vmcb->control.exit_info_1, 3292 vmcb->control.exit_info_2, 3293 vmcb->control.exit_int_info, 3294 vmcb->control.exit_int_info_err, 3295 KVM_ISA_SVM); 3296 3297 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); 3298 if (!nested_vmcb) 3299 return 1; 3300 3301 /* Exit Guest-Mode */ 3302 leave_guest_mode(&svm->vcpu); 3303 svm->nested.vmcb = 0; 3304 3305 /* Give the current vmcb to the guest */ 3306 disable_gif(svm); 3307 3308 nested_vmcb->save.es = vmcb->save.es; 3309 nested_vmcb->save.cs = vmcb->save.cs; 3310 nested_vmcb->save.ss = vmcb->save.ss; 3311 nested_vmcb->save.ds = vmcb->save.ds; 3312 nested_vmcb->save.gdtr = vmcb->save.gdtr; 3313 nested_vmcb->save.idtr = vmcb->save.idtr; 3314 nested_vmcb->save.efer = svm->vcpu.arch.efer; 3315 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 3316 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); 3317 nested_vmcb->save.cr2 = vmcb->save.cr2; 3318 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 3319 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); 3320 nested_vmcb->save.rip = 
vmcb->save.rip; 3321 nested_vmcb->save.rsp = vmcb->save.rsp; 3322 nested_vmcb->save.rax = vmcb->save.rax; 3323 nested_vmcb->save.dr7 = vmcb->save.dr7; 3324 nested_vmcb->save.dr6 = vmcb->save.dr6; 3325 nested_vmcb->save.cpl = vmcb->save.cpl; 3326 3327 nested_vmcb->control.int_ctl = vmcb->control.int_ctl; 3328 nested_vmcb->control.int_vector = vmcb->control.int_vector; 3329 nested_vmcb->control.int_state = vmcb->control.int_state; 3330 nested_vmcb->control.exit_code = vmcb->control.exit_code; 3331 nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; 3332 nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; 3333 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 3334 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 3335 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 3336 3337 if (svm->nrips_enabled) 3338 nested_vmcb->control.next_rip = vmcb->control.next_rip; 3339 3340 /* 3341 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 3342 * to make sure that we do not lose injected events. So check event_inj 3343 * here and copy it to exit_int_info if it is valid. 3344 * Exit_int_info and event_inj can't be both valid because the case 3345 * below only happens on a VMRUN instruction intercept which has 3346 * no valid exit_int_info set. 3347 */ 3348 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { 3349 struct vmcb_control_area *nc = &nested_vmcb->control; 3350 3351 nc->exit_int_info = vmcb->control.event_inj; 3352 nc->exit_int_info_err = vmcb->control.event_inj_err; 3353 } 3354 3355 nested_vmcb->control.tlb_ctl = 0; 3356 nested_vmcb->control.event_inj = 0; 3357 nested_vmcb->control.event_inj_err = 0; 3358 3359 /* We always set V_INTR_MASKING and remember the old value in hflags */ 3360 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 3361 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 3362 3363 /* Restore the original control entries */ 3364 copy_vmcb_control_area(vmcb, hsave); 3365 3366 svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; 3367 kvm_clear_exception_queue(&svm->vcpu); 3368 kvm_clear_interrupt_queue(&svm->vcpu); 3369 3370 svm->nested.nested_cr3 = 0; 3371 3372 /* Restore selected save entries */ 3373 svm->vmcb->save.es = hsave->save.es; 3374 svm->vmcb->save.cs = hsave->save.cs; 3375 svm->vmcb->save.ss = hsave->save.ss; 3376 svm->vmcb->save.ds = hsave->save.ds; 3377 svm->vmcb->save.gdtr = hsave->save.gdtr; 3378 svm->vmcb->save.idtr = hsave->save.idtr; 3379 kvm_set_rflags(&svm->vcpu, hsave->save.rflags); 3380 svm_set_efer(&svm->vcpu, hsave->save.efer); 3381 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 3382 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 3383 if (npt_enabled) { 3384 svm->vmcb->save.cr3 = hsave->save.cr3; 3385 svm->vcpu.arch.cr3 = hsave->save.cr3; 3386 } else { 3387 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 3388 } 3389 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 3390 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 3391 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); 3392 svm->vmcb->save.dr7 = 0; 3393 svm->vmcb->save.cpl = 0; 3394 svm->vmcb->control.exit_int_info = 0; 3395 3396 mark_all_dirty(svm->vmcb); 3397 3398 nested_svm_unmap(page); 3399 3400 nested_svm_uninit_mmu_context(&svm->vcpu); 3401 kvm_mmu_reset_context(&svm->vcpu); 3402 kvm_mmu_load(&svm->vcpu); 3403 3404 /* 3405 * Drop what we picked up for L2 via svm_complete_interrupts() so it 3406 * doesn't end up in L1. 
3407 */ 3408 svm->vcpu.arch.nmi_injected = false; 3409 kvm_clear_exception_queue(&svm->vcpu); 3410 kvm_clear_interrupt_queue(&svm->vcpu); 3411 3412 return 0; 3413 } 3414 3415 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 3416 { 3417 /* 3418 * This function merges the msr permission bitmaps of kvm and the 3419 * nested vmcb. It is optimized in that it only merges the parts where 3420 * the kvm msr permission bitmap may contain zero bits 3421 */ 3422 int i; 3423 3424 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 3425 return true; 3426 3427 for (i = 0; i < MSRPM_OFFSETS; i++) { 3428 u32 value, p; 3429 u64 offset; 3430 3431 if (msrpm_offsets[i] == 0xffffffff) 3432 break; 3433 3434 p = msrpm_offsets[i]; 3435 offset = svm->nested.vmcb_msrpm + (p * 4); 3436 3437 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) 3438 return false; 3439 3440 svm->nested.msrpm[p] = svm->msrpm[p] | value; 3441 } 3442 3443 svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); 3444 3445 return true; 3446 } 3447 3448 static bool nested_vmcb_checks(struct vmcb *vmcb) 3449 { 3450 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) 3451 return false; 3452 3453 if (vmcb->control.asid == 0) 3454 return false; 3455 3456 if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && 3457 !npt_enabled) 3458 return false; 3459 3460 return true; 3461 } 3462 3463 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, 3464 struct vmcb *nested_vmcb, struct page *page) 3465 { 3466 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) 3467 svm->vcpu.arch.hflags |= HF_HIF_MASK; 3468 else 3469 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 3470 3471 if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { 3472 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; 3473 nested_svm_init_mmu_context(&svm->vcpu); 3474 } 3475 3476 /* Load the nested guest state */ 3477 svm->vmcb->save.es = nested_vmcb->save.es; 3478 svm->vmcb->save.cs = nested_vmcb->save.cs; 3479 svm->vmcb->save.ss = nested_vmcb->save.ss; 3480 svm->vmcb->save.ds = nested_vmcb->save.ds; 3481 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 3482 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 3483 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); 3484 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 3485 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 3486 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 3487 if (npt_enabled) { 3488 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 3489 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 3490 } else 3491 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 3492 3493 /* Guest paging mode is active - reset mmu */ 3494 kvm_mmu_reset_context(&svm->vcpu); 3495 3496 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 3497 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 3498 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 3499 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 3500 3501 /* In case we don't even reach vcpu_run, the fields are not updated */ 3502 svm->vmcb->save.rax = nested_vmcb->save.rax; 3503 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 3504 svm->vmcb->save.rip = nested_vmcb->save.rip; 3505 svm->vmcb->save.dr7 = nested_vmcb->save.dr7; 3506 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 3507 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 3508 3509 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; 3510 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & 
~0x0fffULL; 3511 3512 /* cache intercepts */ 3513 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; 3514 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; 3515 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 3516 svm->nested.intercept = nested_vmcb->control.intercept; 3517 3518 svm_flush_tlb(&svm->vcpu, true); 3519 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 3520 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 3521 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 3522 else 3523 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 3524 3525 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 3526 /* We only want the cr8 intercept bits of the guest */ 3527 clr_cr_intercept(svm, INTERCEPT_CR8_READ); 3528 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); 3529 } 3530 3531 /* We don't want to see VMMCALLs from a nested guest */ 3532 clr_intercept(svm, INTERCEPT_VMMCALL); 3533 3534 svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; 3535 svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; 3536 3537 svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; 3538 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 3539 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 3540 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 3541 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 3542 3543 nested_svm_unmap(page); 3544 3545 /* Enter Guest-Mode */ 3546 enter_guest_mode(&svm->vcpu); 3547 3548 /* 3549 * Merge guest and host intercepts - must be called with vcpu in 3550 * guest-mode to take effect here 3551 */ 3552 recalc_intercepts(svm); 3553 3554 svm->nested.vmcb = vmcb_gpa; 3555 3556 enable_gif(svm); 3557 3558 mark_all_dirty(svm->vmcb); 3559 } 3560 3561 static bool nested_svm_vmrun(struct vcpu_svm *svm) 3562 { 3563 struct vmcb *nested_vmcb; 3564 struct vmcb *hsave = svm->nested.hsave; 3565 struct vmcb *vmcb = svm->vmcb; 3566 struct page *page; 3567 u64 vmcb_gpa; 3568 3569 vmcb_gpa = svm->vmcb->save.rax; 3570 3571 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3572 if (!nested_vmcb) 3573 return false; 3574 3575 if (!nested_vmcb_checks(nested_vmcb)) { 3576 nested_vmcb->control.exit_code = SVM_EXIT_ERR; 3577 nested_vmcb->control.exit_code_hi = 0; 3578 nested_vmcb->control.exit_info_1 = 0; 3579 nested_vmcb->control.exit_info_2 = 0; 3580 3581 nested_svm_unmap(page); 3582 3583 return false; 3584 } 3585 3586 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, 3587 nested_vmcb->save.rip, 3588 nested_vmcb->control.int_ctl, 3589 nested_vmcb->control.event_inj, 3590 nested_vmcb->control.nested_ctl); 3591 3592 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, 3593 nested_vmcb->control.intercept_cr >> 16, 3594 nested_vmcb->control.intercept_exceptions, 3595 nested_vmcb->control.intercept); 3596 3597 /* Clear internal status */ 3598 kvm_clear_exception_queue(&svm->vcpu); 3599 kvm_clear_interrupt_queue(&svm->vcpu); 3600 3601 /* 3602 * Save the old vmcb, so we don't need to pick what we save, but can 3603 * restore everything when a VMEXIT occurs 3604 */ 3605 hsave->save.es = vmcb->save.es; 3606 hsave->save.cs = vmcb->save.cs; 3607 hsave->save.ss = vmcb->save.ss; 3608 hsave->save.ds = vmcb->save.ds; 3609 hsave->save.gdtr = vmcb->save.gdtr; 3610 hsave->save.idtr = vmcb->save.idtr; 3611 hsave->save.efer = svm->vcpu.arch.efer; 3612 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 3613 hsave->save.cr4 = svm->vcpu.arch.cr4; 3614 hsave->save.rflags =
kvm_get_rflags(&svm->vcpu); 3615 hsave->save.rip = kvm_rip_read(&svm->vcpu); 3616 hsave->save.rsp = vmcb->save.rsp; 3617 hsave->save.rax = vmcb->save.rax; 3618 if (npt_enabled) 3619 hsave->save.cr3 = vmcb->save.cr3; 3620 else 3621 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); 3622 3623 copy_vmcb_control_area(hsave, vmcb); 3624 3625 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); 3626 3627 return true; 3628 } 3629 3630 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) 3631 { 3632 to_vmcb->save.fs = from_vmcb->save.fs; 3633 to_vmcb->save.gs = from_vmcb->save.gs; 3634 to_vmcb->save.tr = from_vmcb->save.tr; 3635 to_vmcb->save.ldtr = from_vmcb->save.ldtr; 3636 to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; 3637 to_vmcb->save.star = from_vmcb->save.star; 3638 to_vmcb->save.lstar = from_vmcb->save.lstar; 3639 to_vmcb->save.cstar = from_vmcb->save.cstar; 3640 to_vmcb->save.sfmask = from_vmcb->save.sfmask; 3641 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; 3642 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; 3643 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 3644 } 3645 3646 static int vmload_interception(struct vcpu_svm *svm) 3647 { 3648 struct vmcb *nested_vmcb; 3649 struct page *page; 3650 int ret; 3651 3652 if (nested_svm_check_permissions(svm)) 3653 return 1; 3654 3655 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3656 if (!nested_vmcb) 3657 return 1; 3658 3659 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3660 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3661 3662 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 3663 nested_svm_unmap(page); 3664 3665 return ret; 3666 } 3667 3668 static int vmsave_interception(struct vcpu_svm *svm) 3669 { 3670 struct vmcb *nested_vmcb; 3671 struct page *page; 3672 int ret; 3673 3674 if (nested_svm_check_permissions(svm)) 3675 return 1; 3676 3677 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3678 if (!nested_vmcb) 3679 return 1; 3680 3681 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3682 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3683 3684 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 3685 nested_svm_unmap(page); 3686 3687 return ret; 3688 } 3689 3690 static int vmrun_interception(struct vcpu_svm *svm) 3691 { 3692 if (nested_svm_check_permissions(svm)) 3693 return 1; 3694 3695 /* Save rip after vmrun instruction */ 3696 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); 3697 3698 if (!nested_svm_vmrun(svm)) 3699 return 1; 3700 3701 if (!nested_svm_vmrun_msrpm(svm)) 3702 goto failed; 3703 3704 return 1; 3705 3706 failed: 3707 3708 svm->vmcb->control.exit_code = SVM_EXIT_ERR; 3709 svm->vmcb->control.exit_code_hi = 0; 3710 svm->vmcb->control.exit_info_1 = 0; 3711 svm->vmcb->control.exit_info_2 = 0; 3712 3713 nested_svm_vmexit(svm); 3714 3715 return 1; 3716 } 3717 3718 static int stgi_interception(struct vcpu_svm *svm) 3719 { 3720 int ret; 3721 3722 if (nested_svm_check_permissions(svm)) 3723 return 1; 3724 3725 /* 3726 * If VGIF is enabled, the STGI intercept is only added to 3727 * detect the opening of the SMI/NMI window; remove it now. 
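 * (With VGIF the guest's GIF is tracked by the hardware in the VMCB, so STGI
 * is normally not intercepted at all; the intercept is installed only while
 * KVM is waiting for an SMI/NMI window to open.)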
3728 */ 3729 if (vgif_enabled(svm)) 3730 clr_intercept(svm, INTERCEPT_STGI); 3731 3732 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3733 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3734 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3735 3736 enable_gif(svm); 3737 3738 return ret; 3739 } 3740 3741 static int clgi_interception(struct vcpu_svm *svm) 3742 { 3743 int ret; 3744 3745 if (nested_svm_check_permissions(svm)) 3746 return 1; 3747 3748 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3749 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3750 3751 disable_gif(svm); 3752 3753 /* After a CLGI no interrupts should come */ 3754 if (!kvm_vcpu_apicv_active(&svm->vcpu)) { 3755 svm_clear_vintr(svm); 3756 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3757 mark_dirty(svm->vmcb, VMCB_INTR); 3758 } 3759 3760 return ret; 3761 } 3762 3763 static int invlpga_interception(struct vcpu_svm *svm) 3764 { 3765 struct kvm_vcpu *vcpu = &svm->vcpu; 3766 3767 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), 3768 kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3769 3770 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 3771 kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3772 3773 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3774 return kvm_skip_emulated_instruction(&svm->vcpu); 3775 } 3776 3777 static int skinit_interception(struct vcpu_svm *svm) 3778 { 3779 trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3780 3781 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 3782 return 1; 3783 } 3784 3785 static int wbinvd_interception(struct vcpu_svm *svm) 3786 { 3787 return kvm_emulate_wbinvd(&svm->vcpu); 3788 } 3789 3790 static int xsetbv_interception(struct vcpu_svm *svm) 3791 { 3792 u64 new_bv = kvm_read_edx_eax(&svm->vcpu); 3793 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 3794 3795 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { 3796 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3797 return kvm_skip_emulated_instruction(&svm->vcpu); 3798 } 3799 3800 return 1; 3801 } 3802 3803 static int task_switch_interception(struct vcpu_svm *svm) 3804 { 3805 u16 tss_selector; 3806 int reason; 3807 int int_type = svm->vmcb->control.exit_int_info & 3808 SVM_EXITINTINFO_TYPE_MASK; 3809 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 3810 uint32_t type = 3811 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 3812 uint32_t idt_v = 3813 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 3814 bool has_error_code = false; 3815 u32 error_code = 0; 3816 3817 tss_selector = (u16)svm->vmcb->control.exit_info_1; 3818 3819 if (svm->vmcb->control.exit_info_2 & 3820 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 3821 reason = TASK_SWITCH_IRET; 3822 else if (svm->vmcb->control.exit_info_2 & 3823 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 3824 reason = TASK_SWITCH_JMP; 3825 else if (idt_v) 3826 reason = TASK_SWITCH_GATE; 3827 else 3828 reason = TASK_SWITCH_CALL; 3829 3830 if (reason == TASK_SWITCH_GATE) { 3831 switch (type) { 3832 case SVM_EXITINTINFO_TYPE_NMI: 3833 svm->vcpu.arch.nmi_injected = false; 3834 break; 3835 case SVM_EXITINTINFO_TYPE_EXEPT: 3836 if (svm->vmcb->control.exit_info_2 & 3837 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 3838 has_error_code = true; 3839 error_code = 3840 (u32)svm->vmcb->control.exit_info_2; 3841 } 3842 kvm_clear_exception_queue(&svm->vcpu); 3843 break; 3844 case SVM_EXITINTINFO_TYPE_INTR: 3845 kvm_clear_interrupt_queue(&svm->vcpu); 3846 break; 3847 
default: 3848 break; 3849 } 3850 } 3851 3852 if (reason != TASK_SWITCH_GATE || 3853 int_type == SVM_EXITINTINFO_TYPE_SOFT || 3854 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 3855 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 3856 skip_emulated_instruction(&svm->vcpu); 3857 3858 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 3859 int_vec = -1; 3860 3861 if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, 3862 has_error_code, error_code) == EMULATE_FAIL) { 3863 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3864 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3865 svm->vcpu.run->internal.ndata = 0; 3866 return 0; 3867 } 3868 return 1; 3869 } 3870 3871 static int cpuid_interception(struct vcpu_svm *svm) 3872 { 3873 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3874 return kvm_emulate_cpuid(&svm->vcpu); 3875 } 3876 3877 static int iret_interception(struct vcpu_svm *svm) 3878 { 3879 ++svm->vcpu.stat.nmi_window_exits; 3880 clr_intercept(svm, INTERCEPT_IRET); 3881 svm->vcpu.arch.hflags |= HF_IRET_MASK; 3882 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); 3883 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3884 return 1; 3885 } 3886 3887 static int invlpg_interception(struct vcpu_svm *svm) 3888 { 3889 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 3890 return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 3891 3892 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); 3893 return kvm_skip_emulated_instruction(&svm->vcpu); 3894 } 3895 3896 static int emulate_on_interception(struct vcpu_svm *svm) 3897 { 3898 return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 3899 } 3900 3901 static int rsm_interception(struct vcpu_svm *svm) 3902 { 3903 return kvm_emulate_instruction_from_buffer(&svm->vcpu, 3904 rsm_ins_bytes, 2) == EMULATE_DONE; 3905 } 3906 3907 static int rdpmc_interception(struct vcpu_svm *svm) 3908 { 3909 int err; 3910 3911 if (!static_cpu_has(X86_FEATURE_NRIPS)) 3912 return emulate_on_interception(svm); 3913 3914 err = kvm_rdpmc(&svm->vcpu); 3915 return kvm_complete_insn_gp(&svm->vcpu, err); 3916 } 3917 3918 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, 3919 unsigned long val) 3920 { 3921 unsigned long cr0 = svm->vcpu.arch.cr0; 3922 bool ret = false; 3923 u64 intercept; 3924 3925 intercept = svm->nested.intercept; 3926 3927 if (!is_guest_mode(&svm->vcpu) || 3928 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) 3929 return false; 3930 3931 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 3932 val &= ~SVM_CR0_SELECTIVE_MASK; 3933 3934 if (cr0 ^ val) { 3935 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 3936 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 3937 } 3938 3939 return ret; 3940 } 3941 3942 #define CR_VALID (1ULL << 63) 3943 3944 static int cr_interception(struct vcpu_svm *svm) 3945 { 3946 int reg, cr; 3947 unsigned long val; 3948 int err; 3949 3950 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 3951 return emulate_on_interception(svm); 3952 3953 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 3954 return emulate_on_interception(svm); 3955 3956 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 3957 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 3958 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 3959 else 3960 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 3961 3962 err = 0; 3963 if (cr >= 16) { /* mov to cr */ 3964 cr -= 16; 3965 val = kvm_register_read(&svm->vcpu, reg); 3966 switch (cr) { 3967 case 0: 3968 if (!check_selective_cr0_intercepted(svm, val)) 3969 
err = kvm_set_cr0(&svm->vcpu, val); 3970 else 3971 return 1; 3972 3973 break; 3974 case 3: 3975 err = kvm_set_cr3(&svm->vcpu, val); 3976 break; 3977 case 4: 3978 err = kvm_set_cr4(&svm->vcpu, val); 3979 break; 3980 case 8: 3981 err = kvm_set_cr8(&svm->vcpu, val); 3982 break; 3983 default: 3984 WARN(1, "unhandled write to CR%d", cr); 3985 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 3986 return 1; 3987 } 3988 } else { /* mov from cr */ 3989 switch (cr) { 3990 case 0: 3991 val = kvm_read_cr0(&svm->vcpu); 3992 break; 3993 case 2: 3994 val = svm->vcpu.arch.cr2; 3995 break; 3996 case 3: 3997 val = kvm_read_cr3(&svm->vcpu); 3998 break; 3999 case 4: 4000 val = kvm_read_cr4(&svm->vcpu); 4001 break; 4002 case 8: 4003 val = kvm_get_cr8(&svm->vcpu); 4004 break; 4005 default: 4006 WARN(1, "unhandled read from CR%d", cr); 4007 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 4008 return 1; 4009 } 4010 kvm_register_write(&svm->vcpu, reg, val); 4011 } 4012 return kvm_complete_insn_gp(&svm->vcpu, err); 4013 } 4014 4015 static int dr_interception(struct vcpu_svm *svm) 4016 { 4017 int reg, dr; 4018 unsigned long val; 4019 4020 if (svm->vcpu.guest_debug == 0) { 4021 /* 4022 * No more DR vmexits; force a reload of the debug registers 4023 * and reenter on this instruction. The next vmexit will 4024 * retrieve the full state of the debug registers. 4025 */ 4026 clr_dr_intercepts(svm); 4027 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 4028 return 1; 4029 } 4030 4031 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 4032 return emulate_on_interception(svm); 4033 4034 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 4035 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 4036 4037 if (dr >= 16) { /* mov to DRn */ 4038 if (!kvm_require_dr(&svm->vcpu, dr - 16)) 4039 return 1; 4040 val = kvm_register_read(&svm->vcpu, reg); 4041 kvm_set_dr(&svm->vcpu, dr - 16, val); 4042 } else { 4043 if (!kvm_require_dr(&svm->vcpu, dr)) 4044 return 1; 4045 kvm_get_dr(&svm->vcpu, dr, &val); 4046 kvm_register_write(&svm->vcpu, reg, val); 4047 } 4048 4049 return kvm_skip_emulated_instruction(&svm->vcpu); 4050 } 4051 4052 static int cr8_write_interception(struct vcpu_svm *svm) 4053 { 4054 struct kvm_run *kvm_run = svm->vcpu.run; 4055 int r; 4056 4057 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 4058 /* instruction emulation calls kvm_set_cr8() */ 4059 r = cr_interception(svm); 4060 if (lapic_in_kernel(&svm->vcpu)) 4061 return r; 4062 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 4063 return r; 4064 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 4065 return 0; 4066 } 4067 4068 static int svm_get_msr_feature(struct kvm_msr_entry *msr) 4069 { 4070 msr->data = 0; 4071 4072 switch (msr->index) { 4073 case MSR_F10H_DECFG: 4074 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) 4075 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; 4076 break; 4077 default: 4078 return 1; 4079 } 4080 4081 return 0; 4082 } 4083 4084 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 4085 { 4086 struct vcpu_svm *svm = to_svm(vcpu); 4087 4088 switch (msr_info->index) { 4089 case MSR_STAR: 4090 msr_info->data = svm->vmcb->save.star; 4091 break; 4092 #ifdef CONFIG_X86_64 4093 case MSR_LSTAR: 4094 msr_info->data = svm->vmcb->save.lstar; 4095 break; 4096 case MSR_CSTAR: 4097 msr_info->data = svm->vmcb->save.cstar; 4098 break; 4099 case MSR_KERNEL_GS_BASE: 4100 msr_info->data = svm->vmcb->save.kernel_gs_base; 4101 break; 4102 case MSR_SYSCALL_MASK: 4103 msr_info->data = svm->vmcb->save.sfmask; 4104 break; 4105 #endif 4106 case MSR_IA32_SYSENTER_CS: 4107 
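/*
 * SYSENTER_CS is read straight from the VMCB save area, while the
 * EIP/ESP values below are returned from the shadow copies kept in
 * struct vcpu_svm, which svm_set_msr() refreshes on every guest write.
 */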
msr_info->data = svm->vmcb->save.sysenter_cs; 4108 break; 4109 case MSR_IA32_SYSENTER_EIP: 4110 msr_info->data = svm->sysenter_eip; 4111 break; 4112 case MSR_IA32_SYSENTER_ESP: 4113 msr_info->data = svm->sysenter_esp; 4114 break; 4115 case MSR_TSC_AUX: 4116 if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 4117 return 1; 4118 msr_info->data = svm->tsc_aux; 4119 break; 4120 /* 4121 * Nobody will change the following 5 values in the VMCB so we can 4122 * safely return them on rdmsr. They will always be 0 until LBRV is 4123 * implemented. 4124 */ 4125 case MSR_IA32_DEBUGCTLMSR: 4126 msr_info->data = svm->vmcb->save.dbgctl; 4127 break; 4128 case MSR_IA32_LASTBRANCHFROMIP: 4129 msr_info->data = svm->vmcb->save.br_from; 4130 break; 4131 case MSR_IA32_LASTBRANCHTOIP: 4132 msr_info->data = svm->vmcb->save.br_to; 4133 break; 4134 case MSR_IA32_LASTINTFROMIP: 4135 msr_info->data = svm->vmcb->save.last_excp_from; 4136 break; 4137 case MSR_IA32_LASTINTTOIP: 4138 msr_info->data = svm->vmcb->save.last_excp_to; 4139 break; 4140 case MSR_VM_HSAVE_PA: 4141 msr_info->data = svm->nested.hsave_msr; 4142 break; 4143 case MSR_VM_CR: 4144 msr_info->data = svm->nested.vm_cr_msr; 4145 break; 4146 case MSR_IA32_SPEC_CTRL: 4147 if (!msr_info->host_initiated && 4148 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && 4149 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 4150 return 1; 4151 4152 msr_info->data = svm->spec_ctrl; 4153 break; 4154 case MSR_AMD64_VIRT_SPEC_CTRL: 4155 if (!msr_info->host_initiated && 4156 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 4157 return 1; 4158 4159 msr_info->data = svm->virt_spec_ctrl; 4160 break; 4161 case MSR_F15H_IC_CFG: { 4162 4163 int family, model; 4164 4165 family = guest_cpuid_family(vcpu); 4166 model = guest_cpuid_model(vcpu); 4167 4168 if (family < 0 || model < 0) 4169 return kvm_get_msr_common(vcpu, msr_info); 4170 4171 msr_info->data = 0; 4172 4173 if (family == 0x15 && 4174 (model >= 0x2 && model < 0x20)) 4175 msr_info->data = 0x1E; 4176 } 4177 break; 4178 case MSR_F10H_DECFG: 4179 msr_info->data = svm->msr_decfg; 4180 break; 4181 default: 4182 return kvm_get_msr_common(vcpu, msr_info); 4183 } 4184 return 0; 4185 } 4186 4187 static int rdmsr_interception(struct vcpu_svm *svm) 4188 { 4189 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 4190 struct msr_data msr_info; 4191 4192 msr_info.index = ecx; 4193 msr_info.host_initiated = false; 4194 if (svm_get_msr(&svm->vcpu, &msr_info)) { 4195 trace_kvm_msr_read_ex(ecx); 4196 kvm_inject_gp(&svm->vcpu, 0); 4197 return 1; 4198 } else { 4199 trace_kvm_msr_read(ecx, msr_info.data); 4200 4201 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, 4202 msr_info.data & 0xffffffff); 4203 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, 4204 msr_info.data >> 32); 4205 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4206 return kvm_skip_emulated_instruction(&svm->vcpu); 4207 } 4208 } 4209 4210 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 4211 { 4212 struct vcpu_svm *svm = to_svm(vcpu); 4213 int svm_dis, chg_mask; 4214 4215 if (data & ~SVM_VM_CR_VALID_MASK) 4216 return 1; 4217 4218 chg_mask = SVM_VM_CR_VALID_MASK; 4219 4220 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 4221 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 4222 4223 svm->nested.vm_cr_msr &= ~chg_mask; 4224 svm->nested.vm_cr_msr |= (data & chg_mask); 4225 4226 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 4227 4228 /* check for svm_disable while efer.svme is set */ 4229 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 4230 return 1; 4231 4232 return 
0; 4233 } 4234 4235 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 4236 { 4237 struct vcpu_svm *svm = to_svm(vcpu); 4238 4239 u32 ecx = msr->index; 4240 u64 data = msr->data; 4241 switch (ecx) { 4242 case MSR_IA32_CR_PAT: 4243 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 4244 return 1; 4245 vcpu->arch.pat = data; 4246 svm->vmcb->save.g_pat = data; 4247 mark_dirty(svm->vmcb, VMCB_NPT); 4248 break; 4249 case MSR_IA32_SPEC_CTRL: 4250 if (!msr->host_initiated && 4251 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && 4252 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 4253 return 1; 4254 4255 /* The STIBP bit doesn't fault even if it's not advertised */ 4256 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) 4257 return 1; 4258 4259 svm->spec_ctrl = data; 4260 4261 if (!data) 4262 break; 4263 4264 /* 4265 * For non-nested: 4266 * When it's written (to non-zero) for the first time, pass 4267 * it through. 4268 * 4269 * For nested: 4270 * The handling of the MSR bitmap for L2 guests is done in 4271 * nested_svm_vmrun_msrpm. 4272 * We update the L1 MSR bit as well since it will end up 4273 * touching the MSR anyway now. 4274 */ 4275 set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 4276 break; 4277 case MSR_IA32_PRED_CMD: 4278 if (!msr->host_initiated && 4279 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) 4280 return 1; 4281 4282 if (data & ~PRED_CMD_IBPB) 4283 return 1; 4284 4285 if (!data) 4286 break; 4287 4288 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 4289 if (is_guest_mode(vcpu)) 4290 break; 4291 set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 4292 break; 4293 case MSR_AMD64_VIRT_SPEC_CTRL: 4294 if (!msr->host_initiated && 4295 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 4296 return 1; 4297 4298 if (data & ~SPEC_CTRL_SSBD) 4299 return 1; 4300 4301 svm->virt_spec_ctrl = data; 4302 break; 4303 case MSR_STAR: 4304 svm->vmcb->save.star = data; 4305 break; 4306 #ifdef CONFIG_X86_64 4307 case MSR_LSTAR: 4308 svm->vmcb->save.lstar = data; 4309 break; 4310 case MSR_CSTAR: 4311 svm->vmcb->save.cstar = data; 4312 break; 4313 case MSR_KERNEL_GS_BASE: 4314 svm->vmcb->save.kernel_gs_base = data; 4315 break; 4316 case MSR_SYSCALL_MASK: 4317 svm->vmcb->save.sfmask = data; 4318 break; 4319 #endif 4320 case MSR_IA32_SYSENTER_CS: 4321 svm->vmcb->save.sysenter_cs = data; 4322 break; 4323 case MSR_IA32_SYSENTER_EIP: 4324 svm->sysenter_eip = data; 4325 svm->vmcb->save.sysenter_eip = data; 4326 break; 4327 case MSR_IA32_SYSENTER_ESP: 4328 svm->sysenter_esp = data; 4329 svm->vmcb->save.sysenter_esp = data; 4330 break; 4331 case MSR_TSC_AUX: 4332 if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 4333 return 1; 4334 4335 /* 4336 * This is rare, so we update the MSR here instead of using 4337 * direct_access_msrs. Doing that would require a rdmsr in 4338 * svm_vcpu_put. 
4339 */ 4340 svm->tsc_aux = data; 4341 wrmsrl(MSR_TSC_AUX, svm->tsc_aux); 4342 break; 4343 case MSR_IA32_DEBUGCTLMSR: 4344 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 4345 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 4346 __func__, data); 4347 break; 4348 } 4349 if (data & DEBUGCTL_RESERVED_BITS) 4350 return 1; 4351 4352 svm->vmcb->save.dbgctl = data; 4353 mark_dirty(svm->vmcb, VMCB_LBR); 4354 if (data & (1ULL<<0)) 4355 svm_enable_lbrv(svm); 4356 else 4357 svm_disable_lbrv(svm); 4358 break; 4359 case MSR_VM_HSAVE_PA: 4360 svm->nested.hsave_msr = data; 4361 break; 4362 case MSR_VM_CR: 4363 return svm_set_vm_cr(vcpu, data); 4364 case MSR_VM_IGNNE: 4365 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 4366 break; 4367 case MSR_F10H_DECFG: { 4368 struct kvm_msr_entry msr_entry; 4369 4370 msr_entry.index = msr->index; 4371 if (svm_get_msr_feature(&msr_entry)) 4372 return 1; 4373 4374 /* Check the supported bits */ 4375 if (data & ~msr_entry.data) 4376 return 1; 4377 4378 /* Don't allow the guest to change a bit, #GP */ 4379 if (!msr->host_initiated && (data ^ msr_entry.data)) 4380 return 1; 4381 4382 svm->msr_decfg = data; 4383 break; 4384 } 4385 case MSR_IA32_APICBASE: 4386 if (kvm_vcpu_apicv_active(vcpu)) 4387 avic_update_vapic_bar(to_svm(vcpu), data); 4388 /* Follow through */ 4389 default: 4390 return kvm_set_msr_common(vcpu, msr); 4391 } 4392 return 0; 4393 } 4394 4395 static int wrmsr_interception(struct vcpu_svm *svm) 4396 { 4397 struct msr_data msr; 4398 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 4399 u64 data = kvm_read_edx_eax(&svm->vcpu); 4400 4401 msr.data = data; 4402 msr.index = ecx; 4403 msr.host_initiated = false; 4404 4405 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4406 if (kvm_set_msr(&svm->vcpu, &msr)) { 4407 trace_kvm_msr_write_ex(ecx, data); 4408 kvm_inject_gp(&svm->vcpu, 0); 4409 return 1; 4410 } else { 4411 trace_kvm_msr_write(ecx, data); 4412 return kvm_skip_emulated_instruction(&svm->vcpu); 4413 } 4414 } 4415 4416 static int msr_interception(struct vcpu_svm *svm) 4417 { 4418 if (svm->vmcb->control.exit_info_1) 4419 return wrmsr_interception(svm); 4420 else 4421 return rdmsr_interception(svm); 4422 } 4423 4424 static int interrupt_window_interception(struct vcpu_svm *svm) 4425 { 4426 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 4427 svm_clear_vintr(svm); 4428 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 4429 mark_dirty(svm->vmcb, VMCB_INTR); 4430 ++svm->vcpu.stat.irq_window_exits; 4431 return 1; 4432 } 4433 4434 static int pause_interception(struct vcpu_svm *svm) 4435 { 4436 struct kvm_vcpu *vcpu = &svm->vcpu; 4437 bool in_kernel = (svm_get_cpl(vcpu) == 0); 4438 4439 if (pause_filter_thresh) 4440 grow_ple_window(vcpu); 4441 4442 kvm_vcpu_on_spin(vcpu, in_kernel); 4443 return 1; 4444 } 4445 4446 static int nop_interception(struct vcpu_svm *svm) 4447 { 4448 return kvm_skip_emulated_instruction(&(svm->vcpu)); 4449 } 4450 4451 static int monitor_interception(struct vcpu_svm *svm) 4452 { 4453 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 4454 return nop_interception(svm); 4455 } 4456 4457 static int mwait_interception(struct vcpu_svm *svm) 4458 { 4459 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 4460 return nop_interception(svm); 4461 } 4462 4463 enum avic_ipi_failure_cause { 4464 AVIC_IPI_FAILURE_INVALID_INT_TYPE, 4465 AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, 4466 AVIC_IPI_FAILURE_INVALID_TARGET, 4467 AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, 4468 }; 4469 4470 static int 
avic_incomplete_ipi_interception(struct vcpu_svm *svm) 4471 { 4472 u32 icrh = svm->vmcb->control.exit_info_1 >> 32; 4473 u32 icrl = svm->vmcb->control.exit_info_1; 4474 u32 id = svm->vmcb->control.exit_info_2 >> 32; 4475 u32 index = svm->vmcb->control.exit_info_2 & 0xFF; 4476 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4477 4478 trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); 4479 4480 switch (id) { 4481 case AVIC_IPI_FAILURE_INVALID_INT_TYPE: 4482 /* 4483 * AVIC hardware handles the generation of 4484 * IPIs when the specified Message Type is Fixed 4485 * (also known as fixed delivery mode) and 4486 * the Trigger Mode is edge-triggered. The hardware 4487 * also supports self and broadcast delivery modes 4488 * specified via the Destination Shorthand (DSH) 4489 * field of the ICRL. Logical and physical APIC ID 4490 * formats are supported. All other IPI types cause 4491 * a #VMEXIT, which needs to be emulated. 4492 */ 4493 kvm_lapic_reg_write(apic, APIC_ICR2, icrh); 4494 kvm_lapic_reg_write(apic, APIC_ICR, icrl); 4495 break; 4496 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { 4497 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4498 4499 /* 4500 * Update ICR high and low, then emulate sending IPI, 4501 * which is handled when writing APIC_ICR. 4502 */ 4503 kvm_lapic_reg_write(apic, APIC_ICR2, icrh); 4504 kvm_lapic_reg_write(apic, APIC_ICR, icrl); 4505 break; 4506 } 4507 case AVIC_IPI_FAILURE_INVALID_TARGET: 4508 break; 4509 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: 4510 WARN_ONCE(1, "Invalid backing page\n"); 4511 break; 4512 default: 4513 pr_err("Unknown IPI interception\n"); 4514 } 4515 4516 return 1; 4517 } 4518 4519 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 4520 { 4521 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 4522 int index; 4523 u32 *logical_apic_id_table; 4524 int dlid = GET_APIC_LOGICAL_ID(ldr); 4525 4526 if (!dlid) 4527 return NULL; 4528 4529 if (flat) { /* flat */ 4530 index = ffs(dlid) - 1; 4531 if (index > 7) 4532 return NULL; 4533 } else { /* cluster */ 4534 int cluster = (dlid & 0xf0) >> 4; 4535 int apic = ffs(dlid & 0x0f) - 1; 4536 4537 if ((apic < 0) || (apic > 7) || 4538 (cluster >= 0xf)) 4539 return NULL; 4540 index = (cluster << 2) + apic; 4541 } 4542 4543 logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); 4544 4545 return &logical_apic_id_table[index]; 4546 } 4547 4548 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, 4549 bool valid) 4550 { 4551 bool flat; 4552 u32 *entry, new_entry; 4553 4554 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; 4555 entry = avic_get_logical_id_entry(vcpu, ldr, flat); 4556 if (!entry) 4557 return -EINVAL; 4558 4559 new_entry = READ_ONCE(*entry); 4560 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 4561 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); 4562 if (valid) 4563 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; 4564 else 4565 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; 4566 WRITE_ONCE(*entry, new_entry); 4567 4568 return 0; 4569 } 4570 4571 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) 4572 { 4573 int ret; 4574 struct vcpu_svm *svm = to_svm(vcpu); 4575 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); 4576 4577 if (!ldr) 4578 return 1; 4579 4580 ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); 4581 if (ret && svm->ldr_reg) { 4582 avic_ldr_write(vcpu, 0, svm->ldr_reg, false); 4583 svm->ldr_reg = 0; 4584 } else { 4585
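/*
 * Record the LDR so that later APIC ID or DFR updates (see
 * avic_handle_apic_id_update() and avic_handle_dfr_update() below)
 * can refresh the logical APIC ID table entry for this vCPU.
 */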
svm->ldr_reg = ldr; 4586 } 4587 return ret; 4588 } 4589 4590 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) 4591 { 4592 u64 *old, *new; 4593 struct vcpu_svm *svm = to_svm(vcpu); 4594 u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); 4595 u32 id = (apic_id_reg >> 24) & 0xff; 4596 4597 if (vcpu->vcpu_id == id) 4598 return 0; 4599 4600 old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); 4601 new = avic_get_physical_id_entry(vcpu, id); 4602 if (!new || !old) 4603 return 1; 4604 4605 /* We need to move physical_id_entry to new offset */ 4606 *new = *old; 4607 *old = 0ULL; 4608 to_svm(vcpu)->avic_physical_id_cache = new; 4609 4610 /* 4611 * Also update the guest physical APIC ID in the logical 4612 * APIC ID table entry if the LDR has already been set up. 4613 */ 4614 if (svm->ldr_reg) 4615 avic_handle_ldr_update(vcpu); 4616 4617 return 0; 4618 } 4619 4620 static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) 4621 { 4622 struct vcpu_svm *svm = to_svm(vcpu); 4623 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 4624 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 4625 u32 mod = (dfr >> 28) & 0xf; 4626 4627 /* 4628 * We assume that all local APICs are using the same type. 4629 * If this changes, we need to flush the AVIC logical 4630 * APIC ID table. 4631 */ 4632 if (kvm_svm->ldr_mode == mod) 4633 return 0; 4634 4635 clear_page(page_address(kvm_svm->avic_logical_id_table_page)); 4636 kvm_svm->ldr_mode = mod; 4637 4638 if (svm->ldr_reg) 4639 avic_handle_ldr_update(vcpu); 4640 return 0; 4641 } 4642 4643 static int avic_unaccel_trap_write(struct vcpu_svm *svm) 4644 { 4645 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4646 u32 offset = svm->vmcb->control.exit_info_1 & 4647 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 4648 4649 switch (offset) { 4650 case APIC_ID: 4651 if (avic_handle_apic_id_update(&svm->vcpu)) 4652 return 0; 4653 break; 4654 case APIC_LDR: 4655 if (avic_handle_ldr_update(&svm->vcpu)) 4656 return 0; 4657 break; 4658 case APIC_DFR: 4659 avic_handle_dfr_update(&svm->vcpu); 4660 break; 4661 default: 4662 break; 4663 } 4664 4665 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); 4666 4667 return 1; 4668 } 4669 4670 static bool is_avic_unaccelerated_access_trap(u32 offset) 4671 { 4672 bool ret = false; 4673 4674 switch (offset) { 4675 case APIC_ID: 4676 case APIC_EOI: 4677 case APIC_RRR: 4678 case APIC_LDR: 4679 case APIC_DFR: 4680 case APIC_SPIV: 4681 case APIC_ESR: 4682 case APIC_ICR: 4683 case APIC_LVTT: 4684 case APIC_LVTTHMR: 4685 case APIC_LVTPC: 4686 case APIC_LVT0: 4687 case APIC_LVT1: 4688 case APIC_LVTERR: 4689 case APIC_TMICT: 4690 case APIC_TDCR: 4691 ret = true; 4692 break; 4693 default: 4694 break; 4695 } 4696 return ret; 4697 } 4698 4699 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) 4700 { 4701 int ret = 0; 4702 u32 offset = svm->vmcb->control.exit_info_1 & 4703 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 4704 u32 vector = svm->vmcb->control.exit_info_2 & 4705 AVIC_UNACCEL_ACCESS_VECTOR_MASK; 4706 bool write = (svm->vmcb->control.exit_info_1 >> 32) & 4707 AVIC_UNACCEL_ACCESS_WRITE_MASK; 4708 bool trap = is_avic_unaccelerated_access_trap(offset); 4709 4710 trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, 4711 trap, write, vector); 4712 if (trap) { 4713 /* Handling Trap */ 4714 WARN_ONCE(!write, "svm: Handling trap read.\n"); 4715 ret = avic_unaccel_trap_write(svm); 4716 } else { 4717 /* Handling Fault */ 4718 ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); 4719 } 4720 4721 return ret; 4722 } 4723
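/*
 * The table below maps SVM exit codes to their handlers. As a rough
 * sketch of how it is consumed (see handle_exit() further down), the
 * exit code reported in the VMCB is bounds-checked, checked for a
 * non-NULL entry, and then used as an index into the table:
 *
 *	u32 exit_code = svm->vmcb->control.exit_code;
 *
 *	if (exit_code >= ARRAY_SIZE(svm_exit_handlers) ||
 *	    !svm_exit_handlers[exit_code]) {
 *		WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
 *		kvm_queue_exception(vcpu, UD_VECTOR);
 *		return 1;
 *	}
 *	return svm_exit_handlers[exit_code](svm);
 */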
4724 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { 4725 [SVM_EXIT_READ_CR0] = cr_interception, 4726 [SVM_EXIT_READ_CR3] = cr_interception, 4727 [SVM_EXIT_READ_CR4] = cr_interception, 4728 [SVM_EXIT_READ_CR8] = cr_interception, 4729 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 4730 [SVM_EXIT_WRITE_CR0] = cr_interception, 4731 [SVM_EXIT_WRITE_CR3] = cr_interception, 4732 [SVM_EXIT_WRITE_CR4] = cr_interception, 4733 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 4734 [SVM_EXIT_READ_DR0] = dr_interception, 4735 [SVM_EXIT_READ_DR1] = dr_interception, 4736 [SVM_EXIT_READ_DR2] = dr_interception, 4737 [SVM_EXIT_READ_DR3] = dr_interception, 4738 [SVM_EXIT_READ_DR4] = dr_interception, 4739 [SVM_EXIT_READ_DR5] = dr_interception, 4740 [SVM_EXIT_READ_DR6] = dr_interception, 4741 [SVM_EXIT_READ_DR7] = dr_interception, 4742 [SVM_EXIT_WRITE_DR0] = dr_interception, 4743 [SVM_EXIT_WRITE_DR1] = dr_interception, 4744 [SVM_EXIT_WRITE_DR2] = dr_interception, 4745 [SVM_EXIT_WRITE_DR3] = dr_interception, 4746 [SVM_EXIT_WRITE_DR4] = dr_interception, 4747 [SVM_EXIT_WRITE_DR5] = dr_interception, 4748 [SVM_EXIT_WRITE_DR6] = dr_interception, 4749 [SVM_EXIT_WRITE_DR7] = dr_interception, 4750 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 4751 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 4752 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 4753 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 4754 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 4755 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 4756 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 4757 [SVM_EXIT_INTR] = intr_interception, 4758 [SVM_EXIT_NMI] = nmi_interception, 4759 [SVM_EXIT_SMI] = nop_on_interception, 4760 [SVM_EXIT_INIT] = nop_on_interception, 4761 [SVM_EXIT_VINTR] = interrupt_window_interception, 4762 [SVM_EXIT_RDPMC] = rdpmc_interception, 4763 [SVM_EXIT_CPUID] = cpuid_interception, 4764 [SVM_EXIT_IRET] = iret_interception, 4765 [SVM_EXIT_INVD] = emulate_on_interception, 4766 [SVM_EXIT_PAUSE] = pause_interception, 4767 [SVM_EXIT_HLT] = halt_interception, 4768 [SVM_EXIT_INVLPG] = invlpg_interception, 4769 [SVM_EXIT_INVLPGA] = invlpga_interception, 4770 [SVM_EXIT_IOIO] = io_interception, 4771 [SVM_EXIT_MSR] = msr_interception, 4772 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 4773 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 4774 [SVM_EXIT_VMRUN] = vmrun_interception, 4775 [SVM_EXIT_VMMCALL] = vmmcall_interception, 4776 [SVM_EXIT_VMLOAD] = vmload_interception, 4777 [SVM_EXIT_VMSAVE] = vmsave_interception, 4778 [SVM_EXIT_STGI] = stgi_interception, 4779 [SVM_EXIT_CLGI] = clgi_interception, 4780 [SVM_EXIT_SKINIT] = skinit_interception, 4781 [SVM_EXIT_WBINVD] = wbinvd_interception, 4782 [SVM_EXIT_MONITOR] = monitor_interception, 4783 [SVM_EXIT_MWAIT] = mwait_interception, 4784 [SVM_EXIT_XSETBV] = xsetbv_interception, 4785 [SVM_EXIT_NPF] = npf_interception, 4786 [SVM_EXIT_RSM] = rsm_interception, 4787 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 4788 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 4789 }; 4790 4791 static void dump_vmcb(struct kvm_vcpu *vcpu) 4792 { 4793 struct vcpu_svm *svm = to_svm(vcpu); 4794 struct vmcb_control_area *control = &svm->vmcb->control; 4795 struct vmcb_save_area *save = &svm->vmcb->save; 4796 4797 pr_err("VMCB Control Area:\n"); 4798 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); 4799 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); 4800 pr_err("%-20s%04x\n", "dr_read:", 
control->intercept_dr & 0xffff); 4801 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); 4802 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); 4803 pr_err("%-20s%016llx\n", "intercepts:", control->intercept); 4804 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 4805 pr_err("%-20s%d\n", "pause filter threshold:", 4806 control->pause_filter_thresh); 4807 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 4808 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 4809 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 4810 pr_err("%-20s%d\n", "asid:", control->asid); 4811 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 4812 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 4813 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 4814 pr_err("%-20s%08x\n", "int_state:", control->int_state); 4815 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 4816 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 4817 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 4818 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 4819 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 4820 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 4821 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 4822 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 4823 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 4824 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 4825 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 4826 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 4827 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 4828 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 4829 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 4830 pr_err("VMCB State Save Area:\n"); 4831 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4832 "es:", 4833 save->es.selector, save->es.attrib, 4834 save->es.limit, save->es.base); 4835 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4836 "cs:", 4837 save->cs.selector, save->cs.attrib, 4838 save->cs.limit, save->cs.base); 4839 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4840 "ss:", 4841 save->ss.selector, save->ss.attrib, 4842 save->ss.limit, save->ss.base); 4843 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4844 "ds:", 4845 save->ds.selector, save->ds.attrib, 4846 save->ds.limit, save->ds.base); 4847 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4848 "fs:", 4849 save->fs.selector, save->fs.attrib, 4850 save->fs.limit, save->fs.base); 4851 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4852 "gs:", 4853 save->gs.selector, save->gs.attrib, 4854 save->gs.limit, save->gs.base); 4855 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4856 "gdtr:", 4857 save->gdtr.selector, save->gdtr.attrib, 4858 save->gdtr.limit, save->gdtr.base); 4859 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4860 "ldtr:", 4861 save->ldtr.selector, save->ldtr.attrib, 4862 save->ldtr.limit, save->ldtr.base); 4863 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4864 "idtr:", 4865 save->idtr.selector, save->idtr.attrib, 4866 save->idtr.limit, save->idtr.base); 4867 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4868 "tr:", 4869 save->tr.selector, save->tr.attrib, 4870 save->tr.limit, save->tr.base); 4871 pr_err("cpl: %d efer: %016llx\n", 4872 save->cpl, 
save->efer); 4873 pr_err("%-15s %016llx %-13s %016llx\n", 4874 "cr0:", save->cr0, "cr2:", save->cr2); 4875 pr_err("%-15s %016llx %-13s %016llx\n", 4876 "cr3:", save->cr3, "cr4:", save->cr4); 4877 pr_err("%-15s %016llx %-13s %016llx\n", 4878 "dr6:", save->dr6, "dr7:", save->dr7); 4879 pr_err("%-15s %016llx %-13s %016llx\n", 4880 "rip:", save->rip, "rflags:", save->rflags); 4881 pr_err("%-15s %016llx %-13s %016llx\n", 4882 "rsp:", save->rsp, "rax:", save->rax); 4883 pr_err("%-15s %016llx %-13s %016llx\n", 4884 "star:", save->star, "lstar:", save->lstar); 4885 pr_err("%-15s %016llx %-13s %016llx\n", 4886 "cstar:", save->cstar, "sfmask:", save->sfmask); 4887 pr_err("%-15s %016llx %-13s %016llx\n", 4888 "kernel_gs_base:", save->kernel_gs_base, 4889 "sysenter_cs:", save->sysenter_cs); 4890 pr_err("%-15s %016llx %-13s %016llx\n", 4891 "sysenter_esp:", save->sysenter_esp, 4892 "sysenter_eip:", save->sysenter_eip); 4893 pr_err("%-15s %016llx %-13s %016llx\n", 4894 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 4895 pr_err("%-15s %016llx %-13s %016llx\n", 4896 "br_from:", save->br_from, "br_to:", save->br_to); 4897 pr_err("%-15s %016llx %-13s %016llx\n", 4898 "excp_from:", save->last_excp_from, 4899 "excp_to:", save->last_excp_to); 4900 } 4901 4902 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 4903 { 4904 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 4905 4906 *info1 = control->exit_info_1; 4907 *info2 = control->exit_info_2; 4908 } 4909 4910 static int handle_exit(struct kvm_vcpu *vcpu) 4911 { 4912 struct vcpu_svm *svm = to_svm(vcpu); 4913 struct kvm_run *kvm_run = vcpu->run; 4914 u32 exit_code = svm->vmcb->control.exit_code; 4915 4916 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); 4917 4918 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 4919 vcpu->arch.cr0 = svm->vmcb->save.cr0; 4920 if (npt_enabled) 4921 vcpu->arch.cr3 = svm->vmcb->save.cr3; 4922 4923 if (unlikely(svm->nested.exit_required)) { 4924 nested_svm_vmexit(svm); 4925 svm->nested.exit_required = false; 4926 4927 return 1; 4928 } 4929 4930 if (is_guest_mode(vcpu)) { 4931 int vmexit; 4932 4933 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 4934 svm->vmcb->control.exit_info_1, 4935 svm->vmcb->control.exit_info_2, 4936 svm->vmcb->control.exit_int_info, 4937 svm->vmcb->control.exit_int_info_err, 4938 KVM_ISA_SVM); 4939 4940 vmexit = nested_svm_exit_special(svm); 4941 4942 if (vmexit == NESTED_EXIT_CONTINUE) 4943 vmexit = nested_svm_exit_handled(svm); 4944 4945 if (vmexit == NESTED_EXIT_DONE) 4946 return 1; 4947 } 4948 4949 svm_complete_interrupts(svm); 4950 4951 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 4952 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 4953 kvm_run->fail_entry.hardware_entry_failure_reason 4954 = svm->vmcb->control.exit_code; 4955 pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); 4956 dump_vmcb(vcpu); 4957 return 0; 4958 } 4959 4960 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 4961 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 4962 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 4963 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 4964 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 4965 "exit_code 0x%x\n", 4966 __func__, svm->vmcb->control.exit_int_info, 4967 exit_code); 4968 4969 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 4970 || !svm_exit_handlers[exit_code]) { 4971 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); 4972 kvm_queue_exception(vcpu, UD_VECTOR); 4973 return 1; 4974 } 4975 4976 return 
svm_exit_handlers[exit_code](svm); 4977 } 4978 4979 static void reload_tss(struct kvm_vcpu *vcpu) 4980 { 4981 int cpu = raw_smp_processor_id(); 4982 4983 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 4984 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 4985 load_TR_desc(); 4986 } 4987 4988 static void pre_sev_run(struct vcpu_svm *svm, int cpu) 4989 { 4990 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 4991 int asid = sev_get_asid(svm->vcpu.kvm); 4992 4993 /* Assign the asid allocated with this SEV guest */ 4994 svm->vmcb->control.asid = asid; 4995 4996 /* 4997 * Flush guest TLB: 4998 * 4999 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 5000 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 5001 */ 5002 if (sd->sev_vmcbs[asid] == svm->vmcb && 5003 svm->last_cpu == cpu) 5004 return; 5005 5006 svm->last_cpu = cpu; 5007 sd->sev_vmcbs[asid] = svm->vmcb; 5008 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 5009 mark_dirty(svm->vmcb, VMCB_ASID); 5010 } 5011 5012 static void pre_svm_run(struct vcpu_svm *svm) 5013 { 5014 int cpu = raw_smp_processor_id(); 5015 5016 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 5017 5018 if (sev_guest(svm->vcpu.kvm)) 5019 return pre_sev_run(svm, cpu); 5020 5021 /* FIXME: handle wraparound of asid_generation */ 5022 if (svm->asid_generation != sd->asid_generation) 5023 new_asid(svm, sd); 5024 } 5025 5026 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 5027 { 5028 struct vcpu_svm *svm = to_svm(vcpu); 5029 5030 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 5031 vcpu->arch.hflags |= HF_NMI_MASK; 5032 set_intercept(svm, INTERCEPT_IRET); 5033 ++vcpu->stat.nmi_injections; 5034 } 5035 5036 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 5037 { 5038 struct vmcb_control_area *control; 5039 5040 /* The following fields are ignored when AVIC is enabled */ 5041 control = &svm->vmcb->control; 5042 control->int_vector = irq; 5043 control->int_ctl &= ~V_INTR_PRIO_MASK; 5044 control->int_ctl |= V_IRQ_MASK | 5045 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 5046 mark_dirty(svm->vmcb, VMCB_INTR); 5047 } 5048 5049 static void svm_set_irq(struct kvm_vcpu *vcpu) 5050 { 5051 struct vcpu_svm *svm = to_svm(vcpu); 5052 5053 BUG_ON(!(gif_set(svm))); 5054 5055 trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 5056 ++vcpu->stat.irq_injections; 5057 5058 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 5059 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 5060 } 5061 5062 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) 5063 { 5064 return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); 5065 } 5066 5067 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 5068 { 5069 struct vcpu_svm *svm = to_svm(vcpu); 5070 5071 if (svm_nested_virtualize_tpr(vcpu) || 5072 kvm_vcpu_apicv_active(vcpu)) 5073 return; 5074 5075 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); 5076 5077 if (irr == -1) 5078 return; 5079 5080 if (tpr >= irr) 5081 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 5082 } 5083 5084 static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 5085 { 5086 return; 5087 } 5088 5089 static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) 5090 { 5091 return avic && irqchip_split(vcpu->kvm); 5092 } 5093 5094 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 5095 { 5096 } 5097 5098 static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 5099 { 5100 } 5101 5102 /* Note: Currently only used by 
Hyper-V. */ 5103 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 5104 { 5105 struct vcpu_svm *svm = to_svm(vcpu); 5106 struct vmcb *vmcb = svm->vmcb; 5107 5108 if (!kvm_vcpu_apicv_active(&svm->vcpu)) 5109 return; 5110 5111 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; 5112 mark_dirty(vmcb, VMCB_INTR); 5113 } 5114 5115 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 5116 { 5117 return; 5118 } 5119 5120 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) 5121 { 5122 kvm_lapic_set_irr(vec, vcpu->arch.apic); 5123 smp_mb__after_atomic(); 5124 5125 if (avic_vcpu_is_running(vcpu)) 5126 wrmsrl(SVM_AVIC_DOORBELL, 5127 kvm_cpu_get_apicid(vcpu->cpu)); 5128 else 5129 kvm_vcpu_wake_up(vcpu); 5130 } 5131 5132 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 5133 { 5134 unsigned long flags; 5135 struct amd_svm_iommu_ir *cur; 5136 5137 spin_lock_irqsave(&svm->ir_list_lock, flags); 5138 list_for_each_entry(cur, &svm->ir_list, node) { 5139 if (cur->data != pi->ir_data) 5140 continue; 5141 list_del(&cur->node); 5142 kfree(cur); 5143 break; 5144 } 5145 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 5146 } 5147 5148 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 5149 { 5150 int ret = 0; 5151 unsigned long flags; 5152 struct amd_svm_iommu_ir *ir; 5153 5154 /** 5155 * In some cases, the existing irte is updated and re-set, 5156 * so we need to check here if it has already been added 5157 * to the ir_list. 5158 */ 5159 if (pi->ir_data && (pi->prev_ga_tag != 0)) { 5160 struct kvm *kvm = svm->vcpu.kvm; 5161 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); 5162 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); 5163 struct vcpu_svm *prev_svm; 5164 5165 if (!prev_vcpu) { 5166 ret = -EINVAL; 5167 goto out; 5168 } 5169 5170 prev_svm = to_svm(prev_vcpu); 5171 svm_ir_list_del(prev_svm, pi); 5172 } 5173 5174 /** 5175 * Allocate a new amd_iommu_pi_data, which will be 5176 * added to the per-vcpu ir_list. 5177 */ 5178 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); 5179 if (!ir) { 5180 ret = -ENOMEM; 5181 goto out; 5182 } 5183 ir->data = pi->ir_data; 5184 5185 spin_lock_irqsave(&svm->ir_list_lock, flags); 5186 list_add(&ir->node, &svm->ir_list); 5187 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 5188 out: 5189 return ret; 5190 } 5191 5192 /** 5193 * Note: 5194 * The HW cannot support posting multicast/broadcast 5195 * interrupts to a vCPU. So, we still use legacy interrupt 5196 * remapping for these kinds of interrupts. 5197 * 5198 * For lowest-priority interrupts, we only support 5199 * those with a single CPU as the destination, e.g. user 5200 * configures the interrupts via /proc/irq or uses 5201 * irqbalance to make the interrupts single-CPU. 
5202 */ 5203 static int 5204 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 5205 struct vcpu_data *vcpu_info, struct vcpu_svm **svm) 5206 { 5207 struct kvm_lapic_irq irq; 5208 struct kvm_vcpu *vcpu = NULL; 5209 5210 kvm_set_msi_irq(kvm, e, &irq); 5211 5212 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { 5213 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", 5214 __func__, irq.vector); 5215 return -1; 5216 } 5217 5218 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 5219 irq.vector); 5220 *svm = to_svm(vcpu); 5221 vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 5222 vcpu_info->vector = irq.vector; 5223 5224 return 0; 5225 } 5226 5227 /* 5228 * svm_update_pi_irte - set IRTE for Posted-Interrupts 5229 * 5230 * @kvm: kvm 5231 * @host_irq: host irq of the interrupt 5232 * @guest_irq: gsi of the interrupt 5233 * @set: set or unset PI 5234 * returns 0 on success, < 0 on failure 5235 */ 5236 static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, 5237 uint32_t guest_irq, bool set) 5238 { 5239 struct kvm_kernel_irq_routing_entry *e; 5240 struct kvm_irq_routing_table *irq_rt; 5241 int idx, ret = -EINVAL; 5242 5243 if (!kvm_arch_has_assigned_device(kvm) || 5244 !irq_remapping_cap(IRQ_POSTING_CAP)) 5245 return 0; 5246 5247 pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", 5248 __func__, host_irq, guest_irq, set); 5249 5250 idx = srcu_read_lock(&kvm->irq_srcu); 5251 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 5252 WARN_ON(guest_irq >= irq_rt->nr_rt_entries); 5253 5254 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 5255 struct vcpu_data vcpu_info; 5256 struct vcpu_svm *svm = NULL; 5257 5258 if (e->type != KVM_IRQ_ROUTING_MSI) 5259 continue; 5260 5261 /** 5262 * Here, we set up legacy mode in the following cases: 5263 * 1. When the interrupt cannot be targeted to a specific vcpu. 5264 * 2. Unsetting posted interrupt. 5265 * 3. APIC virtualization is disabled for the vcpu. 5266 */ 5267 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && 5268 kvm_vcpu_apicv_active(&svm->vcpu)) { 5269 struct amd_iommu_pi_data pi; 5270 5271 /* Try to enable guest_mode in IRTE */ 5272 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 5273 AVIC_HPA_MASK); 5274 pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 5275 svm->vcpu.vcpu_id); 5276 pi.is_guest_mode = true; 5277 pi.vcpu_data = &vcpu_info; 5278 ret = irq_set_vcpu_affinity(host_irq, &pi); 5279 5280 /** 5281 * Here, we have successfully set up vcpu affinity in 5282 * IOMMU guest mode. Now, we need to store the posted 5283 * interrupt information in a per-vcpu ir_list so that 5284 * we can reference it directly when we update vcpu 5285 * scheduling information in the IOMMU irte. 5286 */ 5287 if (!ret && pi.is_guest_mode) 5288 svm_ir_list_add(svm, &pi); 5289 } else { 5290 /* Use legacy mode in IRTE */ 5291 struct amd_iommu_pi_data pi; 5292 5293 /** 5294 * Here, pi is used to: 5295 * - Tell IOMMU to use legacy mode for this interrupt. 5296 * - Retrieve ga_tag of prior interrupt remapping data. 5297 */ 5298 pi.is_guest_mode = false; 5299 ret = irq_set_vcpu_affinity(host_irq, &pi); 5300 5301 /** 5302 * Check if the posted interrupt was previously 5303 * set up in guest_mode by checking if the ga_tag 5304 * was cached. If so, we need to clean up the per-vcpu 5305 * ir_list. 
5306 */ 5307 if (!ret && pi.prev_ga_tag) { 5308 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); 5309 struct kvm_vcpu *vcpu; 5310 5311 vcpu = kvm_get_vcpu_by_id(kvm, id); 5312 if (vcpu) 5313 svm_ir_list_del(to_svm(vcpu), &pi); 5314 } 5315 } 5316 5317 if (!ret && svm) { 5318 trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, 5319 e->gsi, vcpu_info.vector, 5320 vcpu_info.pi_desc_addr, set); 5321 } 5322 5323 if (ret < 0) { 5324 pr_err("%s: failed to update PI IRTE\n", __func__); 5325 goto out; 5326 } 5327 } 5328 5329 ret = 0; 5330 out: 5331 srcu_read_unlock(&kvm->irq_srcu, idx); 5332 return ret; 5333 } 5334 5335 static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 5336 { 5337 struct vcpu_svm *svm = to_svm(vcpu); 5338 struct vmcb *vmcb = svm->vmcb; 5339 int ret; 5340 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 5341 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 5342 ret = ret && gif_set(svm) && nested_svm_nmi(svm); 5343 5344 return ret; 5345 } 5346 5347 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 5348 { 5349 struct vcpu_svm *svm = to_svm(vcpu); 5350 5351 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); 5352 } 5353 5354 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5355 { 5356 struct vcpu_svm *svm = to_svm(vcpu); 5357 5358 if (masked) { 5359 svm->vcpu.arch.hflags |= HF_NMI_MASK; 5360 set_intercept(svm, INTERCEPT_IRET); 5361 } else { 5362 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 5363 clr_intercept(svm, INTERCEPT_IRET); 5364 } 5365 } 5366 5367 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 5368 { 5369 struct vcpu_svm *svm = to_svm(vcpu); 5370 struct vmcb *vmcb = svm->vmcb; 5371 int ret; 5372 5373 if (!gif_set(svm) || 5374 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 5375 return 0; 5376 5377 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); 5378 5379 if (is_guest_mode(vcpu)) 5380 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 5381 5382 return ret; 5383 } 5384 5385 static void enable_irq_window(struct kvm_vcpu *vcpu) 5386 { 5387 struct vcpu_svm *svm = to_svm(vcpu); 5388 5389 if (kvm_vcpu_apicv_active(vcpu)) 5390 return; 5391 5392 /* 5393 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 5394 * 1, because that's a separate STGI/VMRUN intercept. The next time we 5395 * get that intercept, this function will be called again though and 5396 * we'll get the vintr intercept. However, if the vGIF feature is 5397 * enabled, the STGI interception will not occur. Enable the irq 5398 * window under the assumption that the hardware will set the GIF. 5399 */ 5400 if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { 5401 svm_set_vintr(svm); 5402 svm_inject_irq(svm, 0x0); 5403 } 5404 } 5405 5406 static void enable_nmi_window(struct kvm_vcpu *vcpu) 5407 { 5408 struct vcpu_svm *svm = to_svm(vcpu); 5409 5410 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) 5411 == HF_NMI_MASK) 5412 return; /* IRET will cause a vm exit */ 5413 5414 if (!gif_set(svm)) { 5415 if (vgif_enabled(svm)) 5416 set_intercept(svm, INTERCEPT_STGI); 5417 return; /* STGI will cause a vm exit */ 5418 } 5419 5420 if (svm->nested.exit_required) 5421 return; /* we're not going to run the guest yet */ 5422 5423 /* 5424 * Something prevents NMI from being injected. 
Single step over possible 5425 * problem (IRET or exception injection or interrupt shadow) 5426 */ 5427 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 5428 svm->nmi_singlestep = true; 5429 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 5430 } 5431 5432 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 5433 { 5434 return 0; 5435 } 5436 5437 static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5438 { 5439 return 0; 5440 } 5441 5442 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) 5443 { 5444 struct vcpu_svm *svm = to_svm(vcpu); 5445 5446 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 5447 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 5448 else 5449 svm->asid_generation--; 5450 } 5451 5452 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 5453 { 5454 struct vcpu_svm *svm = to_svm(vcpu); 5455 5456 invlpga(gva, svm->vmcb->control.asid); 5457 } 5458 5459 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 5460 { 5461 } 5462 5463 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 5464 { 5465 struct vcpu_svm *svm = to_svm(vcpu); 5466 5467 if (svm_nested_virtualize_tpr(vcpu)) 5468 return; 5469 5470 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { 5471 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 5472 kvm_set_cr8(vcpu, cr8); 5473 } 5474 } 5475 5476 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 5477 { 5478 struct vcpu_svm *svm = to_svm(vcpu); 5479 u64 cr8; 5480 5481 if (svm_nested_virtualize_tpr(vcpu) || 5482 kvm_vcpu_apicv_active(vcpu)) 5483 return; 5484 5485 cr8 = kvm_get_cr8(vcpu); 5486 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 5487 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 5488 } 5489 5490 static void svm_complete_interrupts(struct vcpu_svm *svm) 5491 { 5492 u8 vector; 5493 int type; 5494 u32 exitintinfo = svm->vmcb->control.exit_int_info; 5495 unsigned int3_injected = svm->int3_injected; 5496 5497 svm->int3_injected = 0; 5498 5499 /* 5500 * If we've made progress since setting HF_IRET_MASK, we've 5501 * executed an IRET and can allow NMI injection. 5502 */ 5503 if ((svm->vcpu.arch.hflags & HF_IRET_MASK) 5504 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { 5505 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 5506 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 5507 } 5508 5509 svm->vcpu.arch.nmi_injected = false; 5510 kvm_clear_exception_queue(&svm->vcpu); 5511 kvm_clear_interrupt_queue(&svm->vcpu); 5512 5513 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 5514 return; 5515 5516 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 5517 5518 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 5519 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 5520 5521 switch (type) { 5522 case SVM_EXITINTINFO_TYPE_NMI: 5523 svm->vcpu.arch.nmi_injected = true; 5524 break; 5525 case SVM_EXITINTINFO_TYPE_EXEPT: 5526 /* 5527 * In case of software exceptions, do not reinject the vector, 5528 * but re-execute the instruction instead. Rewind RIP first 5529 * if we emulated INT3 before. 
5530 */ 5531 if (kvm_exception_is_soft(vector)) { 5532 if (vector == BP_VECTOR && int3_injected && 5533 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) 5534 kvm_rip_write(&svm->vcpu, 5535 kvm_rip_read(&svm->vcpu) - 5536 int3_injected); 5537 break; 5538 } 5539 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 5540 u32 err = svm->vmcb->control.exit_int_info_err; 5541 kvm_requeue_exception_e(&svm->vcpu, vector, err); 5542 5543 } else 5544 kvm_requeue_exception(&svm->vcpu, vector); 5545 break; 5546 case SVM_EXITINTINFO_TYPE_INTR: 5547 kvm_queue_interrupt(&svm->vcpu, vector, false); 5548 break; 5549 default: 5550 break; 5551 } 5552 } 5553 5554 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 5555 { 5556 struct vcpu_svm *svm = to_svm(vcpu); 5557 struct vmcb_control_area *control = &svm->vmcb->control; 5558 5559 control->exit_int_info = control->event_inj; 5560 control->exit_int_info_err = control->event_inj_err; 5561 control->event_inj = 0; 5562 svm_complete_interrupts(svm); 5563 } 5564 5565 static void svm_vcpu_run(struct kvm_vcpu *vcpu) 5566 { 5567 struct vcpu_svm *svm = to_svm(vcpu); 5568 5569 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 5570 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 5571 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 5572 5573 /* 5574 * A vmexit emulation is required before the vcpu can be executed 5575 * again. 5576 */ 5577 if (unlikely(svm->nested.exit_required)) 5578 return; 5579 5580 /* 5581 * Disable singlestep if we're injecting an interrupt/exception. 5582 * We don't want our modified rflags to be pushed on the stack where 5583 * we might not be able to easily reset them if we disabled NMI 5584 * singlestep later. 5585 */ 5586 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 5587 /* 5588 * Event injection happens before external interrupts cause a 5589 * vmexit and interrupts are disabled here, so smp_send_reschedule 5590 * is enough to force an immediate vmexit. 5591 */ 5592 disable_nmi_singlestep(svm); 5593 smp_send_reschedule(vcpu->cpu); 5594 } 5595 5596 pre_svm_run(svm); 5597 5598 sync_lapic_to_cr8(vcpu); 5599 5600 svm->vmcb->save.cr2 = vcpu->arch.cr2; 5601 5602 clgi(); 5603 5604 /* 5605 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 5606 * it's non-zero. Since vmentry is serialising on affected CPUs, there 5607 * is no need to worry about the conditional branch over the wrmsr 5608 * being speculatively taken. 
5609 */ 5610 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); 5611 5612 local_irq_enable(); 5613 5614 asm volatile ( 5615 "push %%" _ASM_BP "; \n\t" 5616 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" 5617 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" 5618 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" 5619 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" 5620 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" 5621 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" 5622 #ifdef CONFIG_X86_64 5623 "mov %c[r8](%[svm]), %%r8 \n\t" 5624 "mov %c[r9](%[svm]), %%r9 \n\t" 5625 "mov %c[r10](%[svm]), %%r10 \n\t" 5626 "mov %c[r11](%[svm]), %%r11 \n\t" 5627 "mov %c[r12](%[svm]), %%r12 \n\t" 5628 "mov %c[r13](%[svm]), %%r13 \n\t" 5629 "mov %c[r14](%[svm]), %%r14 \n\t" 5630 "mov %c[r15](%[svm]), %%r15 \n\t" 5631 #endif 5632 5633 /* Enter guest mode */ 5634 "push %%" _ASM_AX " \n\t" 5635 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" 5636 __ex(SVM_VMLOAD) "\n\t" 5637 __ex(SVM_VMRUN) "\n\t" 5638 __ex(SVM_VMSAVE) "\n\t" 5639 "pop %%" _ASM_AX " \n\t" 5640 5641 /* Save guest registers, load host registers */ 5642 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" 5643 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" 5644 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" 5645 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" 5646 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" 5647 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" 5648 #ifdef CONFIG_X86_64 5649 "mov %%r8, %c[r8](%[svm]) \n\t" 5650 "mov %%r9, %c[r9](%[svm]) \n\t" 5651 "mov %%r10, %c[r10](%[svm]) \n\t" 5652 "mov %%r11, %c[r11](%[svm]) \n\t" 5653 "mov %%r12, %c[r12](%[svm]) \n\t" 5654 "mov %%r13, %c[r13](%[svm]) \n\t" 5655 "mov %%r14, %c[r14](%[svm]) \n\t" 5656 "mov %%r15, %c[r15](%[svm]) \n\t" 5657 /* 5658 * Clear host registers marked as clobbered to prevent 5659 * speculative use. 
5660 */ 5661 "xor %%r8d, %%r8d \n\t" 5662 "xor %%r9d, %%r9d \n\t" 5663 "xor %%r10d, %%r10d \n\t" 5664 "xor %%r11d, %%r11d \n\t" 5665 "xor %%r12d, %%r12d \n\t" 5666 "xor %%r13d, %%r13d \n\t" 5667 "xor %%r14d, %%r14d \n\t" 5668 "xor %%r15d, %%r15d \n\t" 5669 #endif 5670 "xor %%ebx, %%ebx \n\t" 5671 "xor %%ecx, %%ecx \n\t" 5672 "xor %%edx, %%edx \n\t" 5673 "xor %%esi, %%esi \n\t" 5674 "xor %%edi, %%edi \n\t" 5675 "pop %%" _ASM_BP 5676 : 5677 : [svm]"a"(svm), 5678 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 5679 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), 5680 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), 5681 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), 5682 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), 5683 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), 5684 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) 5685 #ifdef CONFIG_X86_64 5686 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), 5687 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), 5688 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), 5689 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), 5690 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), 5691 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), 5692 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), 5693 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 5694 #endif 5695 : "cc", "memory" 5696 #ifdef CONFIG_X86_64 5697 , "rbx", "rcx", "rdx", "rsi", "rdi" 5698 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 5699 #else 5700 , "ebx", "ecx", "edx", "esi", "edi" 5701 #endif 5702 ); 5703 5704 /* Eliminate branch target predictions from guest mode */ 5705 vmexit_fill_RSB(); 5706 5707 #ifdef CONFIG_X86_64 5708 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 5709 #else 5710 loadsegment(fs, svm->host.fs); 5711 #ifndef CONFIG_X86_32_LAZY_GS 5712 loadsegment(gs, svm->host.gs); 5713 #endif 5714 #endif 5715 5716 /* 5717 * We do not use IBRS in the kernel. If this vCPU has used the 5718 * SPEC_CTRL MSR it may have left it on; save the value and 5719 * turn it off. This is much more efficient than blindly adding 5720 * it to the atomic save/restore list. Especially as the former 5721 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 5722 * 5723 * For non-nested case: 5724 * If the L01 MSR bitmap does not intercept the MSR, then we need to 5725 * save it. 5726 * 5727 * For nested case: 5728 * If the L02 MSR bitmap does not intercept the MSR, then we need to 5729 * save it. 
5730 */ 5731 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 5732 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 5733 5734 reload_tss(vcpu); 5735 5736 local_irq_disable(); 5737 5738 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); 5739 5740 vcpu->arch.cr2 = svm->vmcb->save.cr2; 5741 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 5742 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 5743 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 5744 5745 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5746 kvm_before_interrupt(&svm->vcpu); 5747 5748 stgi(); 5749 5750 /* Any pending NMI will happen here */ 5751 5752 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5753 kvm_after_interrupt(&svm->vcpu); 5754 5755 sync_cr8_to_lapic(vcpu); 5756 5757 svm->next_rip = 0; 5758 5759 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 5760 5761 /* if exit due to PF check for async PF */ 5762 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 5763 svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 5764 5765 if (npt_enabled) { 5766 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 5767 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 5768 } 5769 5770 /* 5771 * We need to handle MC intercepts here before the vcpu has a chance to 5772 * change the physical cpu 5773 */ 5774 if (unlikely(svm->vmcb->control.exit_code == 5775 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 5776 svm_handle_mce(svm); 5777 5778 mark_all_clean(svm->vmcb); 5779 } 5780 STACK_FRAME_NON_STANDARD(svm_vcpu_run); 5781 5782 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 5783 { 5784 struct vcpu_svm *svm = to_svm(vcpu); 5785 5786 svm->vmcb->save.cr3 = __sme_set(root); 5787 mark_dirty(svm->vmcb, VMCB_CR); 5788 } 5789 5790 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) 5791 { 5792 struct vcpu_svm *svm = to_svm(vcpu); 5793 5794 svm->vmcb->control.nested_cr3 = __sme_set(root); 5795 mark_dirty(svm->vmcb, VMCB_NPT); 5796 5797 /* Also sync guest cr3 here in case we live migrate */ 5798 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); 5799 mark_dirty(svm->vmcb, VMCB_CR); 5800 } 5801 5802 static int is_disabled(void) 5803 { 5804 u64 vm_cr; 5805 5806 rdmsrl(MSR_VM_CR, vm_cr); 5807 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) 5808 return 1; 5809 5810 return 0; 5811 } 5812 5813 static void 5814 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5815 { 5816 /* 5817 * Patch in the VMMCALL instruction: 5818 */ 5819 hypercall[0] = 0x0f; 5820 hypercall[1] = 0x01; 5821 hypercall[2] = 0xd9; 5822 } 5823 5824 static void svm_check_processor_compat(void *rtn)