// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
#include <linux/frame.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>

#include <asm/virtext.h>
#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id svm_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_SVM),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);

#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
#define SVM_FEATURE_NRIP (1 << 3)
#define SVM_FEATURE_TSC_RATE (1 << 4)
#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
#define SVM_FEATURE_FLUSH_ASID (1 << 6)
#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)

#define SVM_AVIC_DOORBELL 0xc001011b

#define NESTED_EXIT_HOST	0	/* Exit handled on host level */
#define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
#define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

#define TSC_RATIO_RSVD 0xffffff0000000000ULL
#define TSC_RATIO_MIN 0x0000000000000001ULL
#define TSC_RATIO_MAX 0x000000ffffffffffULL

#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)

/*
 * 0xff is broadcast, so the max index allowed for physical APIC ID
 * table is 0xfe. APIC IDs above 0xff are reserved.
 */
#define AVIC_MAX_PHYSICAL_ID_COUNT 255

#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF

/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS 8
#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)

#define AVIC_VM_ID_BITS 24
#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
			  (y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)

static bool erratum_383_found __read_mostly;

static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
	MSR_FS_BASE,
#endif
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_TSC_AUX,
};

#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)

struct kvm_sev_info {
	bool active;		/* SEV enabled guest */
	unsigned int asid;	/* ASID used for this guest */
	unsigned int handle;	/* SEV firmware handle */
	int fd;			/* SEV device fd */
	unsigned long pages_locked; /* Number of pages locked */
	struct list_head regions_list; /* List of registered regions */
};

struct kvm_svm {
	struct kvm kvm;

	/* Struct members for AVIC */
	u32 avic_vm_id;
	struct page *avic_logical_id_table_page;
	struct page *avic_physical_id_table_page;
	struct hlist_node hnode;

	struct kvm_sev_info sev_info;
};

struct kvm_vcpu;

struct nested_state {
	struct vmcb *hsave;
	u64 hsave_msr;
	u64 vm_cr_msr;
	u64 vmcb;

	/* These are the merged vectors */
	u32 *msrpm;

	/* gpa pointers to the real vectors */
	u64 vmcb_msrpm;
	u64 vmcb_iopm;

	/* A VMEXIT is required but not yet emulated */
	bool exit_required;

	/* cache for intercepts of the guest */
	u32 intercept_cr;
	u32 intercept_dr;
	u32 intercept_exceptions;
	u64 intercept;

	/* Nested Paging related state */
	u64 nested_cr3;
};

#define MSRPM_OFFSETS 16
static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

struct vcpu_svm {
	struct kvm_vcpu vcpu;
	struct vmcb *vmcb;
	unsigned long vmcb_pa;
	struct svm_cpu_data *svm_data;
	uint64_t asid_generation;
	uint64_t sysenter_esp;
	uint64_t sysenter_eip;
	uint64_t tsc_aux;

	u64 msr_decfg;

	u64 next_rip;

	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
	struct {
		u16 fs;
		u16 gs;
		u16 ldt;
		u64 gs_base;
	} host;

	u64 spec_ctrl;
	/*
	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
	 * translated into the appropriate L2_CFG bits on the host to
	 * perform speculative control.
	 */
	u64 virt_spec_ctrl;

	u32 *msrpm;

	ulong nmi_iret_rip;

	struct nested_state nested;

	bool nmi_singlestep;
	u64 nmi_singlestep_guest_rflags;

	unsigned int3_injected;
	unsigned long int3_rip;

	/* cached guest cpuid flags for faster access */
	bool nrips_enabled : 1;

	u32 ldr_reg;
	u32 dfr_reg;
	struct page *avic_backing_page;
	u64 *avic_physical_id_cache;
	bool avic_is_running;

	/*
	 * Per-vcpu list of struct amd_svm_iommu_ir:
	 * This is used mainly to store interrupt remapping information used
	 * when updating the vcpu affinity. This avoids the need to scan for
	 * IRTE and try to match ga_tag in the IOMMU driver.
	 */
	struct list_head ir_list;
	spinlock_t ir_list_lock;

	/* which host CPU was used for running this vcpu */
	unsigned int last_cpu;
};

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)

#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)

static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT 0x0100000000ULL

#define MSR_INVALID 0xffffffffU

static const struct svm_direct_access_msrs {
	u32 index;	/* Index of the MSR */
	bool always;	/* True if intercept is always on */
} direct_access_msrs[] = {
	{ .index = MSR_STAR, .always = true },
	{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE, .always = true },
	{ .index = MSR_FS_BASE, .always = true },
	{ .index = MSR_KERNEL_GS_BASE, .always = true },
	{ .index = MSR_LSTAR, .always = true },
	{ .index = MSR_CSTAR, .always = true },
	{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
	{ .index = MSR_IA32_SPEC_CTRL, .always = false },
	{ .index = MSR_IA32_PRED_CMD, .always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
	{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
	{ .index = MSR_IA32_LASTINTTOIP, .always = false },
	{ .index = MSR_INVALID, .always = false },
};

/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
static bool npt_enabled;
#endif

/*
 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates if PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable / disable AVIC */
static int avic;
#ifdef CONFIG_X86_LOCAL_APIC
module_param(avic, int, S_IRUGO);
#endif

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable SEV support */
static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);

static bool __read_mostly dump_invalid_vmcb = 0;
module_param(dump_invalid_vmcb, bool, 0644);

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);

static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				      bool has_error_code, u32 error_code);

enum {
	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
			    pause filter count */
	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
	VMCB_ASID,	 /* ASID */
	VMCB_INTR,	 /* int_ctl, int_vector */
	VMCB_NPT,        /* npt_en, nCR3, gPAT */
	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
	VMCB_DR,         /* DR6, DR7 */
	VMCB_DT,         /* GDT, IDT */
	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
	VMCB_CR2,        /* CR2 only */
	VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
	VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
			  * AVIC PHYSICAL_TABLE pointer,
			  * AVIC LOGICAL_TABLE pointer
			  */
	VMCB_DIRTY_MAX,
};

/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))

#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL

static unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long *sev_asid_bitmap;
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)

struct enc_region {
	struct list_head list;
	unsigned long npages;
	struct page **pages;
	unsigned long uaddr;
	unsigned long size;
};


static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_svm, kvm);
}
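
/*
 * SEV is usable only when CONFIG_KVM_AMD_SEV is enabled and hardware setup
 * discovered at least one SEV ASID (max_sev_asid != 0).
 */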
static inline bool svm_sev_enabled(void)
{
	return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
}

static inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	return sev->active;
#else
	return false;
#endif
}

static inline int sev_get_asid(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	return sev->asid;
}

static inline void mark_all_dirty(struct vmcb *vmcb)
{
	vmcb->control.clean = 0;
}

static inline void mark_all_clean(struct vmcb *vmcb)
{
	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
		& ~VMCB_ALWAYS_DIRTY_MASK;
}

static inline void mark_dirty(struct vmcb *vmcb, int bit)
{
	vmcb->control.clean &= ~(1 << bit);
}

static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_svm, vcpu);
}

static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
	svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
	mark_dirty(svm->vmcb, VMCB_AVIC);
}

static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 *entry = svm->avic_physical_id_cache;

	if (!entry)
		return false;

	return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
}

static void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h;
	struct nested_state *g;

	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
	g = &svm->nested;

	c->intercept_cr = h->intercept_cr | g->intercept_cr;
	c->intercept_dr = h->intercept_dr | g->intercept_dr;
	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
	c->intercept = h->intercept | g->intercept;
}

static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu))
		return svm->nested.hsave;
	else
		return svm->vmcb;
}

static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr |= (1U << bit);

	recalc_intercepts(svm);
}

static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr &= ~(1U << bit);

	recalc_intercepts(svm);
}

static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	return vmcb->control.intercept_cr & (1U << bit);
}

static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
		| (1 << INTERCEPT_DR1_READ)
		| (1 << INTERCEPT_DR2_READ)
		| (1 << INTERCEPT_DR3_READ)
		| (1 << INTERCEPT_DR4_READ)
		| (1 << INTERCEPT_DR5_READ)
		| (1 << INTERCEPT_DR6_READ)
		| (1 << INTERCEPT_DR7_READ)
		| (1 << INTERCEPT_DR0_WRITE)
		| (1 << INTERCEPT_DR1_WRITE)
		| (1 << INTERCEPT_DR2_WRITE)
		| (1 << INTERCEPT_DR3_WRITE)
		| (1 << INTERCEPT_DR4_WRITE)
		| (1 << INTERCEPT_DR5_WRITE)
		| (1 << INTERCEPT_DR6_WRITE)
		| (1 << INTERCEPT_DR7_WRITE);

	recalc_intercepts(svm);
}

static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_dr = 0;

	recalc_intercepts(svm);
}

static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_exceptions |= (1U << bit);

	recalc_intercepts(svm);
}

static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_exceptions &= ~(1U << bit);

	recalc_intercepts(svm);
}

static inline void set_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept |= (1ULL << bit);

	recalc_intercepts(svm);
}

static inline void clr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept &= ~(1ULL << bit);

	recalc_intercepts(svm);
}

static inline bool vgif_enabled(struct vcpu_svm *svm)
{
	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
}

static inline void enable_gif(struct vcpu_svm *svm)
{
	if (vgif_enabled(svm))
		svm->vmcb->control.int_ctl |= V_GIF_MASK;
	else
		svm->vcpu.arch.hflags |= HF_GIF_MASK;
}

static inline void disable_gif(struct vcpu_svm *svm)
{
	if (vgif_enabled(svm))
		svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
	else
		svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}

static inline bool gif_set(struct vcpu_svm *svm)
{
	if (vgif_enabled(svm))
		return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
	else
		return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

struct svm_cpu_data {
	int cpu;

	u64 asid_generation;
	u32 max_asid;
	u32 next_asid;
	u32 min_asid;
	struct kvm_ldttss_desc *tss_desc;

	struct page *save_area;
	struct vmcb *current_vmcb;

	/* index = sev_asid, value = vmcb pointer */
	struct vmcb **sev_vmcbs;
};

static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

static u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);      /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}

#define MAX_INST_SIZE 15

static inline void clgi(void)
{
	asm volatile (__ex("clgi"));
}

static inline void stgi(void)
{
	asm volatile (__ex("stgi"));
}

static inline void invlpga(unsigned long addr, u32 asid)
{
	asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}
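
/*
 * Host paging depth used for NPT: 4-level paging on 64-bit hosts,
 * PAE (3-level) paging on 32-bit hosts.
 */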
static int get_npt_level(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	return PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
				EMULATE_DONE)
			printk(KERN_DEBUG "%s: NOP\n", __func__);
		return;
	}
	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
		       __func__, kvm_rip_read(vcpu), svm->next_rip);

	kvm_rip_write(vcpu, svm->next_rip);
	svm_set_interrupt_shadow(vcpu, 0);
}

static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	bool reinject = vcpu->arch.exception.injected;
	u32 error_code = vcpu->arch.exception.error_code;

	/*
	 * If we are within a nested VM we'd better #VMEXIT and let the guest
	 * handle the exception
	 */
	if (!reinject &&
	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
		return;

	kvm_deliver_exception_payload(&svm->vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		skip_emulated_instruction(&svm->vcpu);
		rip = kvm_rip_read(&svm->vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	return 1;
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
	}


	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());

	if (!sd)
		return;

	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	kfree(sd);
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd;
	int r;

	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
		return -ENOMEM;
	sd->cpu = cpu;
	r = -ENOMEM;
	sd->save_area = alloc_page(GFP_KERNEL);
	if (!sd->save_area)
		goto err_1;

	if (svm_sev_enabled()) {
		r = -ENOMEM;
		sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
					      sizeof(void *),
					      GFP_KERNEL);
		if (!sd->sev_vmcbs)
			goto err_1;
	}

	per_cpu(svm_data, cpu) = sd;

	return 0;

err_1:
	kfree(sd);
	return r;

}

static bool valid_msr_intercept(u32 index)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == index)
			return true;

	return false;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write, &tmp);
}

static void set_msr_interception(u32 *msrpm, unsigned msr,
				 int read, int write)
{
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;
}

static void svm_vcpu_init_msrpm(u32 *msrpm)
{
	int i;

	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;

		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
	}
}
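
/*
 * msrpm_offsets[] records which 32-bit words of the MSR permission map
 * correspond to the MSRs in direct_access_msrs[], so code that needs to
 * merge or scan the permission map only has to visit those offsets.
 */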
static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

static void svm_enable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

static void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}

/* Note:
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS 8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);

/* Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
static int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

static __init int sev_hardware_setup(void)
{
	struct sev_user_data_status *status;
	int rc;

	/* Maximum number of encrypted guests supported simultaneously */
	max_sev_asid = cpuid_ecx(0x8000001F);

	if (!max_sev_asid)
		return 1;

	/* Minimum ASID value that should be used for SEV guest */
	min_sev_asid = cpuid_edx(0x8000001F);

	/* Initialize SEV ASID bitmap */
	sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
	if (!sev_asid_bitmap)
		return 1;

	status = kmalloc(sizeof(*status), GFP_KERNEL);
	if (!status)
		return 1;

	/*
	 * Check SEV platform status.
	 *
	 * PLATFORM_STATUS can be called in any state; if we fail to query
	 * the platform status then either the PSP firmware does not support
	 * the SEV feature or the SEV firmware is dead.
	 */
	rc = sev_platform_status(status, NULL);
	if (rc)
		goto err;

	pr_info("SEV supported\n");

err:
	kfree(status);
	return rc;
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count != old)
		mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	trace_kvm_ple_window_grow(vcpu->vcpu_id,
				  control->pause_filter_count, old);
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count =
			__shrink_ple_window(old,
					    pause_filter_count,
					    pause_filter_count_shrink,
					    pause_filter_count);
	if (control->pause_filter_count != old)
		mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	trace_kvm_ple_window_shrink(vcpu->vcpu_id,
				    control->pause_filter_count, old);
}
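
/*
 * Module-level hardware setup: allocate the I/O permission map, probe the
 * optional SVM features (pause filtering, NPT, NRIPS, AVIC, VLS, VGIF, SEV)
 * and disable anything the CPU or kernel configuration does not support.
 */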
static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
	void *iopm_va;
	int r;

	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);

	if (!iopm_pages)
		return -ENOMEM;

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;

	init_msrpm_offsets();

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		kvm_has_tsc_control = true;
		kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 32;
	}

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
	}

	if (sev) {
		if (boot_cpu_has(X86_FEATURE_SEV) &&
		    IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
			r = sev_hardware_setup();
			if (r)
				sev = false;
		} else {
			sev = false;
		}
	}

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	if (npt_enabled && !npt) {
		printk(KERN_INFO "kvm: Nested Paging disabled\n");
		npt_enabled = false;
	}

	if (npt_enabled) {
		printk(KERN_INFO "kvm: Nested Paging enabled\n");
		kvm_enable_tdp();
	} else
		kvm_disable_tdp();

	if (nrips) {
		if (!boot_cpu_has(X86_FEATURE_NRIPS))
			nrips = false;
	}

	if (avic) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_AVIC) ||
		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
			avic = false;
		} else {
			pr_info("AVIC enabled\n");

			amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
		}
	}

	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	return 0;

err:
	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
	iopm_base = 0;
	return r;
}

static __exit void svm_hardware_unsetup(void)
{
	int cpu;

	if (svm_sev_enabled())
		bitmap_free(sev_asid_bitmap);

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
	iopm_base = 0;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}

static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (is_guest_mode(vcpu))
		return svm->nested.hsave->control.tsc_offset;

	return vcpu->arch.tsc_offset;
}

static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 g_tsc_offset = 0;

	if (is_guest_mode(vcpu)) {
		/* Write L1's TSC offset.  */
		g_tsc_offset = svm->vmcb->control.tsc_offset -
			       svm->nested.hsave->control.tsc_offset;
		svm->nested.hsave->control.tsc_offset = offset;
	}

	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   svm->vmcb->control.tsc_offset - g_tsc_offset,
				   offset);

	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;

	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
	return svm->vmcb->control.tsc_offset;
}

static void avic_init_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));

	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
}

static void init_vmcb(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;

	svm->vcpu.arch.hflags = 0;

	set_cr_intercept(svm, INTERCEPT_CR0_READ);
	set_cr_intercept(svm, INTERCEPT_CR3_READ);
	set_cr_intercept(svm, INTERCEPT_CR4_READ);
	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
	if (!kvm_vcpu_apicv_active(&svm->vcpu))
		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);

	set_intercept(svm, INTERCEPT_INTR);
	set_intercept(svm, INTERCEPT_NMI);
	set_intercept(svm, INTERCEPT_SMI);
	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	set_intercept(svm, INTERCEPT_RDPMC);
	set_intercept(svm, INTERCEPT_CPUID);
	set_intercept(svm, INTERCEPT_INVD);
	set_intercept(svm, INTERCEPT_INVLPG);
	set_intercept(svm, INTERCEPT_INVLPGA);
	set_intercept(svm, INTERCEPT_IOIO_PROT);
	set_intercept(svm, INTERCEPT_MSR_PROT);
	set_intercept(svm, INTERCEPT_TASK_SWITCH);
	set_intercept(svm, INTERCEPT_SHUTDOWN);
	set_intercept(svm, INTERCEPT_VMRUN);
	set_intercept(svm, INTERCEPT_VMMCALL);
	set_intercept(svm, INTERCEPT_VMLOAD);
	set_intercept(svm, INTERCEPT_VMSAVE);
	set_intercept(svm, INTERCEPT_STGI);
	set_intercept(svm, INTERCEPT_CLGI);
	set_intercept(svm, INTERCEPT_SKINIT);
	set_intercept(svm, INTERCEPT_WBINVD);
	set_intercept(svm, INTERCEPT_XSETBV);
	set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
		set_intercept(svm, INTERCEPT_MONITOR);
		set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
		set_intercept(svm, INTERCEPT_HLT);

	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.limit = 0xffff;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	svm_set_efer(&svm->vcpu, 0);
	save->dr6 = 0xffff0ff0;
	kvm_set_rflags(&svm->vcpu, 2);
	save->rip = 0x0000fff0;
	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;

	/*
	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
	 * It also updates the guest-visible cr0 value.
	 */
	svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
	kvm_mmu_reset_context(&svm->vcpu);

	save->cr4 = X86_CR4_PAE;
	/* rdx = ?? */

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = svm->vcpu.arch.pat;
		save->cr3 = 0;
		save->cr4 = 0;
	}
	svm->asid_generation = 0;

	svm->nested.vmcb = 0;
	svm->vcpu.arch.hflags = 0;

	if (pause_filter_count) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		clr_intercept(svm, INTERCEPT_PAUSE);
	}

	if (kvm_vcpu_apicv_active(&svm->vcpu))
		avic_init_vmcb(svm);

	/*
	 * If hardware supports Virtual VMLOAD VMSAVE then enable it
	 * in VMCB and clear intercepts to avoid #VMEXIT.
	 */
	if (vls) {
		clr_intercept(svm, INTERCEPT_VMLOAD);
		clr_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
	}

	if (vgif) {
		clr_intercept(svm, INTERCEPT_STGI);
		clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (sev_guest(svm->vcpu.kvm)) {
		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
		clr_exception_intercept(svm, UD_VECTOR);
	}

	mark_all_dirty(svm->vmcb);

	enable_gif(svm);

}

static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
				       unsigned int index)
{
	u64 *avic_physical_id_table;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);

	if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
		return NULL;

	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);

	return &avic_physical_id_table[index];
}

/**
 * Note:
 * AVIC hardware walks the nested page table to check permissions,
 * but does not use the SPA address specified in the leaf page
 * table entry since it uses address in the AVIC_BACKING_PAGE pointer
 * field of the VMCB. Therefore, we set up the
 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
 */
static int avic_init_access_page(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	int ret = 0;

	mutex_lock(&kvm->slots_lock);
	if (kvm->arch.apic_access_page_done)
		goto out;

	ret = __x86_set_memory_region(kvm,
				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				      APIC_DEFAULT_PHYS_BASE,
				      PAGE_SIZE);
	if (ret)
		goto out;

	kvm->arch.apic_access_page_done = true;
out:
	mutex_unlock(&kvm->slots_lock);
	return ret;
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	int ret;
	u64 *entry, new_entry;
	int id = vcpu->vcpu_id;
	struct vcpu_svm *svm = to_svm(vcpu);

	ret = avic_init_access_page(vcpu);
	if (ret)
		return ret;

	if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
		return -EINVAL;

	if (!svm->vcpu.arch.apic->regs)
		return -EINVAL;

	svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);

	/* Setting AVIC backing page address in the phy APIC ID table */
	entry = avic_get_physical_id_entry(vcpu, id);
	if (!entry)
		return -EINVAL;

	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
	WRITE_ONCE(*entry, new_entry);

	svm->avic_physical_id_cache = entry;

	return 0;
}

static void __sev_asid_free(int asid)
{
	struct svm_cpu_data *sd;
	int cpu, pos;

	pos = asid - 1;
	clear_bit(pos, sev_asid_bitmap);

	for_each_possible_cpu(cpu) {
		sd = per_cpu(svm_data, cpu);
		sd->sev_vmcbs[pos] = NULL;
	}
}

static void sev_asid_free(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	__sev_asid_free(sev->asid);
}

static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
	struct sev_data_decommission *decommission;
	struct sev_data_deactivate *data;

	if (!handle)
		return;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return;

	/* deactivate handle */
	data->handle = handle;
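
	/*
	 * DEACTIVATE detaches the ASID from the firmware context; caches are
	 * then flushed (WBINVD + DF_FLUSH) before the handle is finally
	 * DECOMMISSIONed and the ASID can be reused.
	 */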
	sev_guest_deactivate(data, NULL);

	wbinvd_on_all_cpus();
	sev_guest_df_flush(NULL);
	kfree(data);

	decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
	if (!decommission)
		return;

	/* decommission handle */
	decommission->handle = handle;
	sev_guest_decommission(decommission, NULL);

	kfree(decommission);
}

static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
				    unsigned long ulen, unsigned long *n,
				    int write)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	unsigned long npages, npinned, size;
	unsigned long locked, lock_limit;
	struct page **pages;
	unsigned long first, last;

	if (ulen == 0 || uaddr + ulen < uaddr)
		return NULL;

	/* Calculate number of pages. */
	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
	npages = (last - first + 1);

	locked = sev->pages_locked + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
		return NULL;
	}

	/* Avoid using vmalloc for smaller buffers. */
	size = npages * sizeof(struct page *);
	if (size > PAGE_SIZE)
		pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
				  PAGE_KERNEL);
	else
		pages = kmalloc(size, GFP_KERNEL_ACCOUNT);

	if (!pages)
		return NULL;

	/* Pin the user virtual address. */
	npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
	if (npinned != npages) {
		pr_err("SEV: Failure locking %lu pages.\n", npages);
		goto err;
	}

	*n = npages;
	sev->pages_locked = locked;

	return pages;

err:
	if (npinned > 0)
		release_pages(pages, npinned);

	kvfree(pages);
	return NULL;
}

static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
			     unsigned long npages)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	release_pages(pages, npages);
	kvfree(pages);
	sev->pages_locked -= npages;
}

static void sev_clflush_pages(struct page *pages[], unsigned long npages)
{
	uint8_t *page_virtual;
	unsigned long i;

	if (npages == 0 || pages == NULL)
		return;

	for (i = 0; i < npages; i++) {
		page_virtual = kmap_atomic(pages[i]);
		clflush_cache_range(page_virtual, PAGE_SIZE);
		kunmap_atomic(page_virtual);
	}
}

static void __unregister_enc_region_locked(struct kvm *kvm,
					   struct enc_region *region)
{
	/*
	 * The guest may change the memory encryption attribute from C=0 -> C=1
	 * or vice versa for this memory range. Let's make sure caches are
	 * flushed to ensure that guest data gets written into memory with
	 * correct C-bit.
	 */
	sev_clflush_pages(region->pages, region->npages);

	sev_unpin_memory(kvm, region->pages, region->npages);
	list_del(&region->list);
	kfree(region);
}

static struct kvm *svm_vm_alloc(void)
{
	struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
					    GFP_KERNEL_ACCOUNT | __GFP_ZERO,
					    PAGE_KERNEL);
	return &kvm_svm->kvm;
}

static void svm_vm_free(struct kvm *kvm)
{
	vfree(to_kvm_svm(kvm));
}

static void sev_vm_destroy(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct list_head *head = &sev->regions_list;
	struct list_head *pos, *q;

	if (!sev_guest(kvm))
		return;

	mutex_lock(&kvm->lock);

	/*
	 * If userspace was terminated before unregistering the memory regions
	 * then let's unpin all the registered memory.
	 */
	if (!list_empty(head)) {
		list_for_each_safe(pos, q, head) {
			__unregister_enc_region_locked(kvm,
				list_entry(pos, struct enc_region, list));
		}
	}

	mutex_unlock(&kvm->lock);

	sev_unbind_asid(kvm, sev->handle);
	sev_asid_free(kvm);
}

static void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!avic)
		return;

	if (kvm_svm->avic_logical_id_table_page)
		__free_page(kvm_svm->avic_logical_id_table_page);
	if (kvm_svm->avic_physical_id_table_page)
		__free_page(kvm_svm->avic_physical_id_table_page);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
}

static int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	struct page *p_page;
	struct page *l_page;
	u32 vm_id;

	if (!avic)
		return 0;

	/* Allocating physical APIC ID table (4KB) */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!p_page)
		goto free_avic;

	kvm_svm->avic_physical_id_table_page = p_page;
	clear_page(page_address(p_page));

	/* Allocating logical APIC ID table (4KB) */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!l_page)
		goto free_avic;

	kvm_svm->avic_logical_id_table_page = l_page;
	clear_page(page_address(l_page));

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	if (list_empty(&svm->ir_list))
		goto out;

	list_for_each_entry(ir, &svm->ir_list, node) {
		ret = amd_iommu_update_ga(cpu, r, ir->data);
		if (ret)
			break;
	}
out:
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
	return ret;
}

static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	u64 entry;
	/* ID = 0xff (broadcast), ID > 0xff (reserved) */
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Since the host physical APIC id is 8 bits,
	 * we can support host APIC IDs up to 255.
	 */
	if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	if (svm->avic_is_running)
		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
					svm->avic_is_running);
}

static void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	u64 entry;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
		avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}

/**
 * This function is called during VCPU halt/unhalt.
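 * Toggling avic_is_running re-runs avic_vcpu_load()/avic_vcpu_put() so the
 * physical ID table's is-running bit and the IOMMU IRTE hints stay in sync.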
2081 */ 2082 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) 2083 { 2084 struct vcpu_svm *svm = to_svm(vcpu); 2085 2086 svm->avic_is_running = is_run; 2087 if (is_run) 2088 avic_vcpu_load(vcpu, vcpu->cpu); 2089 else 2090 avic_vcpu_put(vcpu); 2091 } 2092 2093 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 2094 { 2095 struct vcpu_svm *svm = to_svm(vcpu); 2096 u32 dummy; 2097 u32 eax = 1; 2098 2099 vcpu->arch.microcode_version = 0x01000065; 2100 svm->spec_ctrl = 0; 2101 svm->virt_spec_ctrl = 0; 2102 2103 if (!init_event) { 2104 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | 2105 MSR_IA32_APICBASE_ENABLE; 2106 if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) 2107 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 2108 } 2109 init_vmcb(svm); 2110 2111 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); 2112 kvm_rdx_write(vcpu, eax); 2113 2114 if (kvm_vcpu_apicv_active(vcpu) && !init_event) 2115 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); 2116 } 2117 2118 static int avic_init_vcpu(struct vcpu_svm *svm) 2119 { 2120 int ret; 2121 2122 if (!kvm_vcpu_apicv_active(&svm->vcpu)) 2123 return 0; 2124 2125 ret = avic_init_backing_page(&svm->vcpu); 2126 if (ret) 2127 return ret; 2128 2129 INIT_LIST_HEAD(&svm->ir_list); 2130 spin_lock_init(&svm->ir_list_lock); 2131 svm->dfr_reg = APIC_DFR_FLAT; 2132 2133 return ret; 2134 } 2135 2136 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 2137 { 2138 struct vcpu_svm *svm; 2139 struct page *page; 2140 struct page *msrpm_pages; 2141 struct page *hsave_page; 2142 struct page *nested_msrpm_pages; 2143 int err; 2144 2145 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 2146 if (!svm) { 2147 err = -ENOMEM; 2148 goto out; 2149 } 2150 2151 svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, 2152 GFP_KERNEL_ACCOUNT); 2153 if (!svm->vcpu.arch.user_fpu) { 2154 printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); 2155 err = -ENOMEM; 2156 goto free_partial_svm; 2157 } 2158 2159 svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 2160 GFP_KERNEL_ACCOUNT); 2161 if (!svm->vcpu.arch.guest_fpu) { 2162 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 2163 err = -ENOMEM; 2164 goto free_user_fpu; 2165 } 2166 2167 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 2168 if (err) 2169 goto free_svm; 2170 2171 err = -ENOMEM; 2172 page = alloc_page(GFP_KERNEL_ACCOUNT); 2173 if (!page) 2174 goto uninit; 2175 2176 msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); 2177 if (!msrpm_pages) 2178 goto free_page1; 2179 2180 nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); 2181 if (!nested_msrpm_pages) 2182 goto free_page2; 2183 2184 hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); 2185 if (!hsave_page) 2186 goto free_page3; 2187 2188 err = avic_init_vcpu(svm); 2189 if (err) 2190 goto free_page4; 2191 2192 /* We initialize this flag to true to make sure that the is_running 2193 * bit would be set the first time the vcpu is loaded. 
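	 * avic_set_running() keeps the flag in sync afterwards, as the vCPU
	 * blocks and unblocks (svm_vcpu_blocking()/svm_vcpu_unblocking()).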
2194 */ 2195 svm->avic_is_running = true; 2196 2197 svm->nested.hsave = page_address(hsave_page); 2198 2199 svm->msrpm = page_address(msrpm_pages); 2200 svm_vcpu_init_msrpm(svm->msrpm); 2201 2202 svm->nested.msrpm = page_address(nested_msrpm_pages); 2203 svm_vcpu_init_msrpm(svm->nested.msrpm); 2204 2205 svm->vmcb = page_address(page); 2206 clear_page(svm->vmcb); 2207 svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); 2208 svm->asid_generation = 0; 2209 init_vmcb(svm); 2210 2211 svm_init_osvw(&svm->vcpu); 2212 2213 return &svm->vcpu; 2214 2215 free_page4: 2216 __free_page(hsave_page); 2217 free_page3: 2218 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 2219 free_page2: 2220 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); 2221 free_page1: 2222 __free_page(page); 2223 uninit: 2224 kvm_vcpu_uninit(&svm->vcpu); 2225 free_svm: 2226 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); 2227 free_user_fpu: 2228 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); 2229 free_partial_svm: 2230 kmem_cache_free(kvm_vcpu_cache, svm); 2231 out: 2232 return ERR_PTR(err); 2233 } 2234 2235 static void svm_clear_current_vmcb(struct vmcb *vmcb) 2236 { 2237 int i; 2238 2239 for_each_online_cpu(i) 2240 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); 2241 } 2242 2243 static void svm_free_vcpu(struct kvm_vcpu *vcpu) 2244 { 2245 struct vcpu_svm *svm = to_svm(vcpu); 2246 2247 /* 2248 * The vmcb page can be recycled, causing a false negative in 2249 * svm_vcpu_load(). So, ensure that no logical CPU has this 2250 * vmcb page recorded as its current vmcb. 2251 */ 2252 svm_clear_current_vmcb(svm->vmcb); 2253 2254 __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); 2255 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 2256 __free_page(virt_to_page(svm->nested.hsave)); 2257 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); 2258 kvm_vcpu_uninit(vcpu); 2259 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); 2260 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); 2261 kmem_cache_free(kvm_vcpu_cache, svm); 2262 } 2263 2264 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2265 { 2266 struct vcpu_svm *svm = to_svm(vcpu); 2267 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 2268 int i; 2269 2270 if (unlikely(cpu != vcpu->cpu)) { 2271 svm->asid_generation = 0; 2272 mark_all_dirty(svm->vmcb); 2273 } 2274 2275 #ifdef CONFIG_X86_64 2276 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); 2277 #endif 2278 savesegment(fs, svm->host.fs); 2279 savesegment(gs, svm->host.gs); 2280 svm->host.ldt = kvm_read_ldt(); 2281 2282 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 2283 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 2284 2285 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 2286 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; 2287 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { 2288 __this_cpu_write(current_tsc_ratio, tsc_ratio); 2289 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); 2290 } 2291 } 2292 /* This assumes that the kernel never uses MSR_TSC_AUX */ 2293 if (static_cpu_has(X86_FEATURE_RDTSCP)) 2294 wrmsrl(MSR_TSC_AUX, svm->tsc_aux); 2295 2296 if (sd->current_vmcb != svm->vmcb) { 2297 sd->current_vmcb = svm->vmcb; 2298 indirect_branch_prediction_barrier(); 2299 } 2300 avic_vcpu_load(vcpu, cpu); 2301 } 2302 2303 static void svm_vcpu_put(struct kvm_vcpu *vcpu) 2304 { 2305 struct vcpu_svm *svm = to_svm(vcpu); 2306 int i; 2307 2308 avic_vcpu_put(vcpu); 2309 2310 ++vcpu->stat.host_state_reload; 2311 kvm_load_ldt(svm->host.ldt); 2312 #ifdef 
CONFIG_X86_64 2313 loadsegment(fs, svm->host.fs); 2314 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); 2315 load_gs_index(svm->host.gs); 2316 #else 2317 #ifdef CONFIG_X86_32_LAZY_GS 2318 loadsegment(gs, svm->host.gs); 2319 #endif 2320 #endif 2321 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 2322 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 2323 } 2324 2325 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) 2326 { 2327 avic_set_running(vcpu, false); 2328 } 2329 2330 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) 2331 { 2332 avic_set_running(vcpu, true); 2333 } 2334 2335 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 2336 { 2337 struct vcpu_svm *svm = to_svm(vcpu); 2338 unsigned long rflags = svm->vmcb->save.rflags; 2339 2340 if (svm->nmi_singlestep) { 2341 /* Hide our flags if they were not set by the guest */ 2342 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 2343 rflags &= ~X86_EFLAGS_TF; 2344 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 2345 rflags &= ~X86_EFLAGS_RF; 2346 } 2347 return rflags; 2348 } 2349 2350 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 2351 { 2352 if (to_svm(vcpu)->nmi_singlestep) 2353 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2354 2355 /* 2356 * Any change of EFLAGS.VM is accompanied by a reload of SS 2357 * (caused by either a task switch or an inter-privilege IRET), 2358 * so we do not need to update the CPL here. 2359 */ 2360 to_svm(vcpu)->vmcb->save.rflags = rflags; 2361 } 2362 2363 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2364 { 2365 switch (reg) { 2366 case VCPU_EXREG_PDPTR: 2367 BUG_ON(!npt_enabled); 2368 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 2369 break; 2370 default: 2371 BUG(); 2372 } 2373 } 2374 2375 static void svm_set_vintr(struct vcpu_svm *svm) 2376 { 2377 set_intercept(svm, INTERCEPT_VINTR); 2378 } 2379 2380 static void svm_clear_vintr(struct vcpu_svm *svm) 2381 { 2382 clr_intercept(svm, INTERCEPT_VINTR); 2383 } 2384 2385 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 2386 { 2387 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 2388 2389 switch (seg) { 2390 case VCPU_SREG_CS: return &save->cs; 2391 case VCPU_SREG_DS: return &save->ds; 2392 case VCPU_SREG_ES: return &save->es; 2393 case VCPU_SREG_FS: return &save->fs; 2394 case VCPU_SREG_GS: return &save->gs; 2395 case VCPU_SREG_SS: return &save->ss; 2396 case VCPU_SREG_TR: return &save->tr; 2397 case VCPU_SREG_LDTR: return &save->ldtr; 2398 } 2399 BUG(); 2400 return NULL; 2401 } 2402 2403 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) 2404 { 2405 struct vmcb_seg *s = svm_seg(vcpu, seg); 2406 2407 return s->base; 2408 } 2409 2410 static void svm_get_segment(struct kvm_vcpu *vcpu, 2411 struct kvm_segment *var, int seg) 2412 { 2413 struct vmcb_seg *s = svm_seg(vcpu, seg); 2414 2415 var->base = s->base; 2416 var->limit = s->limit; 2417 var->selector = s->selector; 2418 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; 2419 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; 2420 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 2421 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; 2422 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; 2423 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 2424 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 2425 2426 /* 2427 * AMD CPUs circa 2014 track the G bit for all segments except CS. 
2428 * However, the SVM spec states that the G bit is not observed by the 2429 * CPU, and some VMware virtual CPUs drop the G bit for all segments. 2430 * So let's synthesize a legal G bit for all segments, this helps 2431 * running KVM nested. It also helps cross-vendor migration, because 2432 * Intel's vmentry has a check on the 'G' bit. 2433 */ 2434 var->g = s->limit > 0xfffff; 2435 2436 /* 2437 * AMD's VMCB does not have an explicit unusable field, so emulate it 2438 * for cross vendor migration purposes by "not present" 2439 */ 2440 var->unusable = !var->present; 2441 2442 switch (seg) { 2443 case VCPU_SREG_TR: 2444 /* 2445 * Work around a bug where the busy flag in the tr selector 2446 * isn't exposed 2447 */ 2448 var->type |= 0x2; 2449 break; 2450 case VCPU_SREG_DS: 2451 case VCPU_SREG_ES: 2452 case VCPU_SREG_FS: 2453 case VCPU_SREG_GS: 2454 /* 2455 * The accessed bit must always be set in the segment 2456 * descriptor cache, although it can be cleared in the 2457 * descriptor, the cached bit always remains at 1. Since 2458 * Intel has a check on this, set it here to support 2459 * cross-vendor migration. 2460 */ 2461 if (!var->unusable) 2462 var->type |= 0x1; 2463 break; 2464 case VCPU_SREG_SS: 2465 /* 2466 * On AMD CPUs sometimes the DB bit in the segment 2467 * descriptor is left as 1, although the whole segment has 2468 * been made unusable. Clear it here to pass an Intel VMX 2469 * entry check when cross vendor migrating. 2470 */ 2471 if (var->unusable) 2472 var->db = 0; 2473 /* This is symmetric with svm_set_segment() */ 2474 var->dpl = to_svm(vcpu)->vmcb->save.cpl; 2475 break; 2476 } 2477 } 2478 2479 static int svm_get_cpl(struct kvm_vcpu *vcpu) 2480 { 2481 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 2482 2483 return save->cpl; 2484 } 2485 2486 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2487 { 2488 struct vcpu_svm *svm = to_svm(vcpu); 2489 2490 dt->size = svm->vmcb->save.idtr.limit; 2491 dt->address = svm->vmcb->save.idtr.base; 2492 } 2493 2494 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2495 { 2496 struct vcpu_svm *svm = to_svm(vcpu); 2497 2498 svm->vmcb->save.idtr.limit = dt->size; 2499 svm->vmcb->save.idtr.base = dt->address ; 2500 mark_dirty(svm->vmcb, VMCB_DT); 2501 } 2502 2503 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2504 { 2505 struct vcpu_svm *svm = to_svm(vcpu); 2506 2507 dt->size = svm->vmcb->save.gdtr.limit; 2508 dt->address = svm->vmcb->save.gdtr.base; 2509 } 2510 2511 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 2512 { 2513 struct vcpu_svm *svm = to_svm(vcpu); 2514 2515 svm->vmcb->save.gdtr.limit = dt->size; 2516 svm->vmcb->save.gdtr.base = dt->address ; 2517 mark_dirty(svm->vmcb, VMCB_DT); 2518 } 2519 2520 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 2521 { 2522 } 2523 2524 static void svm_decache_cr3(struct kvm_vcpu *vcpu) 2525 { 2526 } 2527 2528 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2529 { 2530 } 2531 2532 static void update_cr0_intercept(struct vcpu_svm *svm) 2533 { 2534 ulong gcr0 = svm->vcpu.arch.cr0; 2535 u64 *hcr0 = &svm->vmcb->save.cr0; 2536 2537 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 2538 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 2539 2540 mark_dirty(svm->vmcb, VMCB_CR); 2541 2542 if (gcr0 == *hcr0) { 2543 clr_cr_intercept(svm, INTERCEPT_CR0_READ); 2544 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); 2545 } else { 2546 set_cr_intercept(svm, INTERCEPT_CR0_READ); 2547 set_cr_intercept(svm, INTERCEPT_CR0_WRITE); 
2548 } 2549 } 2550 2551 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 2552 { 2553 struct vcpu_svm *svm = to_svm(vcpu); 2554 2555 #ifdef CONFIG_X86_64 2556 if (vcpu->arch.efer & EFER_LME) { 2557 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 2558 vcpu->arch.efer |= EFER_LMA; 2559 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 2560 } 2561 2562 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 2563 vcpu->arch.efer &= ~EFER_LMA; 2564 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 2565 } 2566 } 2567 #endif 2568 vcpu->arch.cr0 = cr0; 2569 2570 if (!npt_enabled) 2571 cr0 |= X86_CR0_PG | X86_CR0_WP; 2572 2573 /* 2574 * re-enable caching here because the QEMU bios 2575 * does not do it - this results in some delay at 2576 * reboot 2577 */ 2578 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 2579 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 2580 svm->vmcb->save.cr0 = cr0; 2581 mark_dirty(svm->vmcb, VMCB_CR); 2582 update_cr0_intercept(svm); 2583 } 2584 2585 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2586 { 2587 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; 2588 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 2589 2590 if (cr4 & X86_CR4_VMXE) 2591 return 1; 2592 2593 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 2594 svm_flush_tlb(vcpu, true); 2595 2596 vcpu->arch.cr4 = cr4; 2597 if (!npt_enabled) 2598 cr4 |= X86_CR4_PAE; 2599 cr4 |= host_cr4_mce; 2600 to_svm(vcpu)->vmcb->save.cr4 = cr4; 2601 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 2602 return 0; 2603 } 2604 2605 static void svm_set_segment(struct kvm_vcpu *vcpu, 2606 struct kvm_segment *var, int seg) 2607 { 2608 struct vcpu_svm *svm = to_svm(vcpu); 2609 struct vmcb_seg *s = svm_seg(vcpu, seg); 2610 2611 s->base = var->base; 2612 s->limit = var->limit; 2613 s->selector = var->selector; 2614 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); 2615 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; 2616 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; 2617 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; 2618 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; 2619 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; 2620 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 2621 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 2622 2623 /* 2624 * This is always accurate, except if SYSRET returned to a segment 2625 * with SS.DPL != 3. Intel does not have this quirk, and always 2626 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 2627 * would entail passing the CPL to userspace and back. 
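	 * Keeping save.cpl in sync here also matters because svm_get_cpl()
	 * and svm_get_segment() read the CPL straight back out of save.cpl.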
2628 */ 2629 if (seg == VCPU_SREG_SS) 2630 /* This is symmetric with svm_get_segment() */ 2631 svm->vmcb->save.cpl = (var->dpl & 3); 2632 2633 mark_dirty(svm->vmcb, VMCB_SEG); 2634 } 2635 2636 static void update_bp_intercept(struct kvm_vcpu *vcpu) 2637 { 2638 struct vcpu_svm *svm = to_svm(vcpu); 2639 2640 clr_exception_intercept(svm, BP_VECTOR); 2641 2642 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 2643 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2644 set_exception_intercept(svm, BP_VECTOR); 2645 } else 2646 vcpu->guest_debug = 0; 2647 } 2648 2649 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 2650 { 2651 if (sd->next_asid > sd->max_asid) { 2652 ++sd->asid_generation; 2653 sd->next_asid = sd->min_asid; 2654 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 2655 } 2656 2657 svm->asid_generation = sd->asid_generation; 2658 svm->vmcb->control.asid = sd->next_asid++; 2659 2660 mark_dirty(svm->vmcb, VMCB_ASID); 2661 } 2662 2663 static u64 svm_get_dr6(struct kvm_vcpu *vcpu) 2664 { 2665 return to_svm(vcpu)->vmcb->save.dr6; 2666 } 2667 2668 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) 2669 { 2670 struct vcpu_svm *svm = to_svm(vcpu); 2671 2672 svm->vmcb->save.dr6 = value; 2673 mark_dirty(svm->vmcb, VMCB_DR); 2674 } 2675 2676 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 2677 { 2678 struct vcpu_svm *svm = to_svm(vcpu); 2679 2680 get_debugreg(vcpu->arch.db[0], 0); 2681 get_debugreg(vcpu->arch.db[1], 1); 2682 get_debugreg(vcpu->arch.db[2], 2); 2683 get_debugreg(vcpu->arch.db[3], 3); 2684 vcpu->arch.dr6 = svm_get_dr6(vcpu); 2685 vcpu->arch.dr7 = svm->vmcb->save.dr7; 2686 2687 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 2688 set_dr_intercepts(svm); 2689 } 2690 2691 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 2692 { 2693 struct vcpu_svm *svm = to_svm(vcpu); 2694 2695 svm->vmcb->save.dr7 = value; 2696 mark_dirty(svm->vmcb, VMCB_DR); 2697 } 2698 2699 static int pf_interception(struct vcpu_svm *svm) 2700 { 2701 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); 2702 u64 error_code = svm->vmcb->control.exit_info_1; 2703 2704 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, 2705 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 2706 svm->vmcb->control.insn_bytes : NULL, 2707 svm->vmcb->control.insn_len); 2708 } 2709 2710 static int npf_interception(struct vcpu_svm *svm) 2711 { 2712 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); 2713 u64 error_code = svm->vmcb->control.exit_info_1; 2714 2715 trace_kvm_page_fault(fault_address, error_code); 2716 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, 2717 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 
2718 svm->vmcb->control.insn_bytes : NULL, 2719 svm->vmcb->control.insn_len); 2720 } 2721 2722 static int db_interception(struct vcpu_svm *svm) 2723 { 2724 struct kvm_run *kvm_run = svm->vcpu.run; 2725 struct kvm_vcpu *vcpu = &svm->vcpu; 2726 2727 if (!(svm->vcpu.guest_debug & 2728 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 2729 !svm->nmi_singlestep) { 2730 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 2731 return 1; 2732 } 2733 2734 if (svm->nmi_singlestep) { 2735 disable_nmi_singlestep(svm); 2736 /* Make sure we check for pending NMIs upon entry */ 2737 kvm_make_request(KVM_REQ_EVENT, vcpu); 2738 } 2739 2740 if (svm->vcpu.guest_debug & 2741 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 2742 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2743 kvm_run->debug.arch.pc = 2744 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2745 kvm_run->debug.arch.exception = DB_VECTOR; 2746 return 0; 2747 } 2748 2749 return 1; 2750 } 2751 2752 static int bp_interception(struct vcpu_svm *svm) 2753 { 2754 struct kvm_run *kvm_run = svm->vcpu.run; 2755 2756 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2757 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2758 kvm_run->debug.arch.exception = BP_VECTOR; 2759 return 0; 2760 } 2761 2762 static int ud_interception(struct vcpu_svm *svm) 2763 { 2764 return handle_ud(&svm->vcpu); 2765 } 2766 2767 static int ac_interception(struct vcpu_svm *svm) 2768 { 2769 kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); 2770 return 1; 2771 } 2772 2773 static int gp_interception(struct vcpu_svm *svm) 2774 { 2775 struct kvm_vcpu *vcpu = &svm->vcpu; 2776 u32 error_code = svm->vmcb->control.exit_info_1; 2777 int er; 2778 2779 WARN_ON_ONCE(!enable_vmware_backdoor); 2780 2781 er = kvm_emulate_instruction(vcpu, 2782 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); 2783 if (er == EMULATE_USER_EXIT) 2784 return 0; 2785 else if (er != EMULATE_DONE) 2786 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2787 return 1; 2788 } 2789 2790 static bool is_erratum_383(void) 2791 { 2792 int err, i; 2793 u64 value; 2794 2795 if (!erratum_383_found) 2796 return false; 2797 2798 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); 2799 if (err) 2800 return false; 2801 2802 /* Bit 62 may or may not be set for this mce */ 2803 value &= ~(1ULL << 62); 2804 2805 if (value != 0xb600000000010015ULL) 2806 return false; 2807 2808 /* Clear MCi_STATUS registers */ 2809 for (i = 0; i < 6; ++i) 2810 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); 2811 2812 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); 2813 if (!err) { 2814 u32 low, high; 2815 2816 value &= ~(1ULL << 2); 2817 low = lower_32_bits(value); 2818 high = upper_32_bits(value); 2819 2820 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); 2821 } 2822 2823 /* Flush tlb to evict multi-match entries */ 2824 __flush_tlb_all(); 2825 2826 return true; 2827 } 2828 2829 static void svm_handle_mce(struct vcpu_svm *svm) 2830 { 2831 if (is_erratum_383()) { 2832 /* 2833 * Erratum 383 triggered. Guest state is corrupt so kill the 2834 * guest. 2835 */ 2836 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 2837 2838 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); 2839 2840 return; 2841 } 2842 2843 /* 2844 * On an #MC intercept the MCE handler is not called automatically in 2845 * the host. So do it by hand here. 
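	 * (int $0x12 raises vector 18, the machine-check exception, so the
	 * host's #MC handler runs as usual.)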
2846 */ 2847 asm volatile ( 2848 "int $0x12\n"); 2849 /* not sure if we ever come back to this point */ 2850 2851 return; 2852 } 2853 2854 static int mc_interception(struct vcpu_svm *svm) 2855 { 2856 return 1; 2857 } 2858 2859 static int shutdown_interception(struct vcpu_svm *svm) 2860 { 2861 struct kvm_run *kvm_run = svm->vcpu.run; 2862 2863 /* 2864 * VMCB is undefined after a SHUTDOWN intercept 2865 * so reinitialize it. 2866 */ 2867 clear_page(svm->vmcb); 2868 init_vmcb(svm); 2869 2870 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2871 return 0; 2872 } 2873 2874 static int io_interception(struct vcpu_svm *svm) 2875 { 2876 struct kvm_vcpu *vcpu = &svm->vcpu; 2877 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2878 int size, in, string; 2879 unsigned port; 2880 2881 ++svm->vcpu.stat.io_exits; 2882 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2883 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2884 if (string) 2885 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 2886 2887 port = io_info >> 16; 2888 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2889 svm->next_rip = svm->vmcb->control.exit_info_2; 2890 2891 return kvm_fast_pio(&svm->vcpu, size, port, in); 2892 } 2893 2894 static int nmi_interception(struct vcpu_svm *svm) 2895 { 2896 return 1; 2897 } 2898 2899 static int intr_interception(struct vcpu_svm *svm) 2900 { 2901 ++svm->vcpu.stat.irq_exits; 2902 return 1; 2903 } 2904 2905 static int nop_on_interception(struct vcpu_svm *svm) 2906 { 2907 return 1; 2908 } 2909 2910 static int halt_interception(struct vcpu_svm *svm) 2911 { 2912 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 2913 return kvm_emulate_halt(&svm->vcpu); 2914 } 2915 2916 static int vmmcall_interception(struct vcpu_svm *svm) 2917 { 2918 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2919 return kvm_emulate_hypercall(&svm->vcpu); 2920 } 2921 2922 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) 2923 { 2924 struct vcpu_svm *svm = to_svm(vcpu); 2925 2926 return svm->nested.nested_cr3; 2927 } 2928 2929 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) 2930 { 2931 struct vcpu_svm *svm = to_svm(vcpu); 2932 u64 cr3 = svm->nested.nested_cr3; 2933 u64 pdpte; 2934 int ret; 2935 2936 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, 2937 offset_in_page(cr3) + index * 8, 8); 2938 if (ret) 2939 return 0; 2940 return pdpte; 2941 } 2942 2943 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, 2944 unsigned long root) 2945 { 2946 struct vcpu_svm *svm = to_svm(vcpu); 2947 2948 svm->vmcb->control.nested_cr3 = __sme_set(root); 2949 mark_dirty(svm->vmcb, VMCB_NPT); 2950 } 2951 2952 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, 2953 struct x86_exception *fault) 2954 { 2955 struct vcpu_svm *svm = to_svm(vcpu); 2956 2957 if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { 2958 /* 2959 * TODO: track the cause of the nested page fault, and 2960 * correctly fill in the high bits of exit_info_1. 2961 */ 2962 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 2963 svm->vmcb->control.exit_code_hi = 0; 2964 svm->vmcb->control.exit_info_1 = (1ULL << 32); 2965 svm->vmcb->control.exit_info_2 = fault->address; 2966 } 2967 2968 svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; 2969 svm->vmcb->control.exit_info_1 |= fault->error_code; 2970 2971 /* 2972 * The present bit is always zero for page structure faults on real 2973 * hardware. 
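	 * The check below follows that: when bit 33 of exit_info_1 marks the
	 * fault as a page-structure fault, the P bit is cleared.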
2974 */ 2975 if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) 2976 svm->vmcb->control.exit_info_1 &= ~1; 2977 2978 nested_svm_vmexit(svm); 2979 } 2980 2981 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 2982 { 2983 WARN_ON(mmu_is_nested(vcpu)); 2984 2985 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 2986 kvm_init_shadow_mmu(vcpu); 2987 vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; 2988 vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; 2989 vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; 2990 vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; 2991 vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); 2992 reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); 2993 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 2994 } 2995 2996 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) 2997 { 2998 vcpu->arch.mmu = &vcpu->arch.root_mmu; 2999 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 3000 } 3001 3002 static int nested_svm_check_permissions(struct vcpu_svm *svm) 3003 { 3004 if (!(svm->vcpu.arch.efer & EFER_SVME) || 3005 !is_paging(&svm->vcpu)) { 3006 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 3007 return 1; 3008 } 3009 3010 if (svm->vmcb->save.cpl) { 3011 kvm_inject_gp(&svm->vcpu, 0); 3012 return 1; 3013 } 3014 3015 return 0; 3016 } 3017 3018 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 3019 bool has_error_code, u32 error_code) 3020 { 3021 int vmexit; 3022 3023 if (!is_guest_mode(&svm->vcpu)) 3024 return 0; 3025 3026 vmexit = nested_svm_intercept(svm); 3027 if (vmexit != NESTED_EXIT_DONE) 3028 return 0; 3029 3030 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 3031 svm->vmcb->control.exit_code_hi = 0; 3032 svm->vmcb->control.exit_info_1 = error_code; 3033 3034 /* 3035 * EXITINFO2 is undefined for all exception intercepts other 3036 * than #PF. 3037 */ 3038 if (svm->vcpu.arch.exception.nested_apf) 3039 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; 3040 else if (svm->vcpu.arch.exception.has_payload) 3041 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; 3042 else 3043 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 3044 3045 svm->nested.exit_required = true; 3046 return vmexit; 3047 } 3048 3049 /* This function returns true if it is safe to enable the irq window */ 3050 static inline bool nested_svm_intr(struct vcpu_svm *svm) 3051 { 3052 if (!is_guest_mode(&svm->vcpu)) 3053 return true; 3054 3055 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 3056 return true; 3057 3058 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 3059 return false; 3060 3061 /* 3062 * If a vmexit was already requested (by an intercepted exception, 3063 * for instance), do not overwrite it with an "external interrupt" 3064 * vmexit. 3065 */ 3066 if (svm->nested.exit_required) 3067 return false; 3068 3069 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 3070 svm->vmcb->control.exit_info_1 = 0; 3071 svm->vmcb->control.exit_info_2 = 0; 3072 3073 if (svm->nested.intercept & 1ULL) { 3074 /* 3075 * The #vmexit can't be emulated here directly because this 3076 * code path runs with irqs and preemption disabled. A 3077 * #vmexit emulation might sleep. Only signal a request for 3078 * the #vmexit here.
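	 * The request is picked up later, from the regular exit handling
	 * path, where the emulation is allowed to sleep.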
3079 */ 3080 svm->nested.exit_required = true; 3081 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 3082 return false; 3083 } 3084 3085 return true; 3086 } 3087 3088 /* This function returns true if it is safe to enable the nmi window */ 3089 static inline bool nested_svm_nmi(struct vcpu_svm *svm) 3090 { 3091 if (!is_guest_mode(&svm->vcpu)) 3092 return true; 3093 3094 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 3095 return true; 3096 3097 svm->vmcb->control.exit_code = SVM_EXIT_NMI; 3098 svm->nested.exit_required = true; 3099 3100 return false; 3101 } 3102 3103 static int nested_svm_intercept_ioio(struct vcpu_svm *svm) 3104 { 3105 unsigned port, size, iopm_len; 3106 u16 val, mask; 3107 u8 start_bit; 3108 u64 gpa; 3109 3110 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) 3111 return NESTED_EXIT_HOST; 3112 3113 port = svm->vmcb->control.exit_info_1 >> 16; 3114 size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> 3115 SVM_IOIO_SIZE_SHIFT; 3116 gpa = svm->nested.vmcb_iopm + (port / 8); 3117 start_bit = port % 8; 3118 iopm_len = (start_bit + size > 8) ? 2 : 1; 3119 mask = (0xf >> (4 - size)) << start_bit; 3120 val = 0; 3121 3122 if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) 3123 return NESTED_EXIT_DONE; 3124 3125 return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 3126 } 3127 3128 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) 3129 { 3130 u32 offset, msr, value; 3131 int write, mask; 3132 3133 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 3134 return NESTED_EXIT_HOST; 3135 3136 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3137 offset = svm_msrpm_offset(msr); 3138 write = svm->vmcb->control.exit_info_1 & 1; 3139 mask = 1 << ((2 * (msr & 0xf)) + write); 3140 3141 if (offset == MSR_INVALID) 3142 return NESTED_EXIT_DONE; 3143 3144 /* Offset is in 32-bit units but we need it in bytes */ 3145 offset *= 4; 3146 3147 if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) 3148 return NESTED_EXIT_DONE; 3149 3150 return (value & mask) ?
NESTED_EXIT_DONE : NESTED_EXIT_HOST; 3151 } 3152 3153 /* DB exceptions for our internal use must not cause vmexit */ 3154 static int nested_svm_intercept_db(struct vcpu_svm *svm) 3155 { 3156 unsigned long dr6; 3157 3158 /* if we're not singlestepping, it's not ours */ 3159 if (!svm->nmi_singlestep) 3160 return NESTED_EXIT_DONE; 3161 3162 /* if it's not a singlestep exception, it's not ours */ 3163 if (kvm_get_dr(&svm->vcpu, 6, &dr6)) 3164 return NESTED_EXIT_DONE; 3165 if (!(dr6 & DR6_BS)) 3166 return NESTED_EXIT_DONE; 3167 3168 /* if the guest is singlestepping, it should get the vmexit */ 3169 if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { 3170 disable_nmi_singlestep(svm); 3171 return NESTED_EXIT_DONE; 3172 } 3173 3174 /* it's ours, the nested hypervisor must not see this one */ 3175 return NESTED_EXIT_HOST; 3176 } 3177 3178 static int nested_svm_exit_special(struct vcpu_svm *svm) 3179 { 3180 u32 exit_code = svm->vmcb->control.exit_code; 3181 3182 switch (exit_code) { 3183 case SVM_EXIT_INTR: 3184 case SVM_EXIT_NMI: 3185 case SVM_EXIT_EXCP_BASE + MC_VECTOR: 3186 return NESTED_EXIT_HOST; 3187 case SVM_EXIT_NPF: 3188 /* For now we are always handling NPFs when using them */ 3189 if (npt_enabled) 3190 return NESTED_EXIT_HOST; 3191 break; 3192 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 3193 /* When we're shadowing, trap PFs, but not async PF */ 3194 if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) 3195 return NESTED_EXIT_HOST; 3196 break; 3197 default: 3198 break; 3199 } 3200 3201 return NESTED_EXIT_CONTINUE; 3202 } 3203 3204 /* 3205 * If this function returns true, this #vmexit was already handled 3206 */ 3207 static int nested_svm_intercept(struct vcpu_svm *svm) 3208 { 3209 u32 exit_code = svm->vmcb->control.exit_code; 3210 int vmexit = NESTED_EXIT_HOST; 3211 3212 switch (exit_code) { 3213 case SVM_EXIT_MSR: 3214 vmexit = nested_svm_exit_handled_msr(svm); 3215 break; 3216 case SVM_EXIT_IOIO: 3217 vmexit = nested_svm_intercept_ioio(svm); 3218 break; 3219 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { 3220 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); 3221 if (svm->nested.intercept_cr & bit) 3222 vmexit = NESTED_EXIT_DONE; 3223 break; 3224 } 3225 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { 3226 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); 3227 if (svm->nested.intercept_dr & bit) 3228 vmexit = NESTED_EXIT_DONE; 3229 break; 3230 } 3231 case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { 3232 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 3233 if (svm->nested.intercept_exceptions & excp_bits) { 3234 if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) 3235 vmexit = nested_svm_intercept_db(svm); 3236 else 3237 vmexit = NESTED_EXIT_DONE; 3238 } 3239 /* async page fault always cause vmexit */ 3240 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && 3241 svm->vcpu.arch.exception.nested_apf != 0) 3242 vmexit = NESTED_EXIT_DONE; 3243 break; 3244 } 3245 case SVM_EXIT_ERR: { 3246 vmexit = NESTED_EXIT_DONE; 3247 break; 3248 } 3249 default: { 3250 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 3251 if (svm->nested.intercept & exit_bits) 3252 vmexit = NESTED_EXIT_DONE; 3253 } 3254 } 3255 3256 return vmexit; 3257 } 3258 3259 static int nested_svm_exit_handled(struct vcpu_svm *svm) 3260 { 3261 int vmexit; 3262 3263 vmexit = nested_svm_intercept(svm); 3264 3265 if (vmexit == NESTED_EXIT_DONE) 3266 nested_svm_vmexit(svm); 3267 3268 return vmexit; 3269 } 3270 3271 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) 3272 { 3273 struct vmcb_control_area *dst = &dst_vmcb->control; 3274 struct vmcb_control_area *from = &from_vmcb->control; 3275 3276 dst->intercept_cr = from->intercept_cr; 3277 dst->intercept_dr = from->intercept_dr; 3278 dst->intercept_exceptions = from->intercept_exceptions; 3279 dst->intercept = from->intercept; 3280 dst->iopm_base_pa = from->iopm_base_pa; 3281 dst->msrpm_base_pa = from->msrpm_base_pa; 3282 dst->tsc_offset = from->tsc_offset; 3283 dst->asid = from->asid; 3284 dst->tlb_ctl = from->tlb_ctl; 3285 dst->int_ctl = from->int_ctl; 3286 dst->int_vector = from->int_vector; 3287 dst->int_state = from->int_state; 3288 dst->exit_code = from->exit_code; 3289 dst->exit_code_hi = from->exit_code_hi; 3290 dst->exit_info_1 = from->exit_info_1; 3291 dst->exit_info_2 = from->exit_info_2; 3292 dst->exit_int_info = from->exit_int_info; 3293 dst->exit_int_info_err = from->exit_int_info_err; 3294 dst->nested_ctl = from->nested_ctl; 3295 dst->event_inj = from->event_inj; 3296 dst->event_inj_err = from->event_inj_err; 3297 dst->nested_cr3 = from->nested_cr3; 3298 dst->virt_ext = from->virt_ext; 3299 dst->pause_filter_count = from->pause_filter_count; 3300 dst->pause_filter_thresh = from->pause_filter_thresh; 3301 } 3302 3303 static int nested_svm_vmexit(struct vcpu_svm *svm) 3304 { 3305 int rc; 3306 struct vmcb *nested_vmcb; 3307 struct vmcb *hsave = svm->nested.hsave; 3308 struct vmcb *vmcb = svm->vmcb; 3309 struct kvm_host_map map; 3310 3311 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 3312 vmcb->control.exit_info_1, 3313 vmcb->control.exit_info_2, 3314 vmcb->control.exit_int_info, 3315 vmcb->control.exit_int_info_err, 3316 KVM_ISA_SVM); 3317 3318 rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); 3319 if (rc) { 3320 if (rc == -EINVAL) 3321 kvm_inject_gp(&svm->vcpu, 0); 3322 return 1; 3323 } 3324 3325 nested_vmcb = map.hva; 3326 3327 /* Exit Guest-Mode */ 3328 leave_guest_mode(&svm->vcpu); 3329 svm->nested.vmcb = 0; 3330 3331 /* Give the current vmcb to the guest */ 3332 disable_gif(svm); 3333 3334 nested_vmcb->save.es = vmcb->save.es; 3335 nested_vmcb->save.cs = vmcb->save.cs; 3336 nested_vmcb->save.ss = vmcb->save.ss; 3337 nested_vmcb->save.ds = vmcb->save.ds; 3338 nested_vmcb->save.gdtr = vmcb->save.gdtr; 3339 nested_vmcb->save.idtr = vmcb->save.idtr; 3340 nested_vmcb->save.efer = svm->vcpu.arch.efer; 3341 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 3342 
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); 3343 nested_vmcb->save.cr2 = vmcb->save.cr2; 3344 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 3345 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); 3346 nested_vmcb->save.rip = vmcb->save.rip; 3347 nested_vmcb->save.rsp = vmcb->save.rsp; 3348 nested_vmcb->save.rax = vmcb->save.rax; 3349 nested_vmcb->save.dr7 = vmcb->save.dr7; 3350 nested_vmcb->save.dr6 = vmcb->save.dr6; 3351 nested_vmcb->save.cpl = vmcb->save.cpl; 3352 3353 nested_vmcb->control.int_ctl = vmcb->control.int_ctl; 3354 nested_vmcb->control.int_vector = vmcb->control.int_vector; 3355 nested_vmcb->control.int_state = vmcb->control.int_state; 3356 nested_vmcb->control.exit_code = vmcb->control.exit_code; 3357 nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; 3358 nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; 3359 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 3360 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 3361 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 3362 3363 if (svm->nrips_enabled) 3364 nested_vmcb->control.next_rip = vmcb->control.next_rip; 3365 3366 /* 3367 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 3368 * to make sure that we do not lose injected events. So check event_inj 3369 * here and copy it to exit_int_info if it is valid. 3370 * Exit_int_info and event_inj can't be both valid because the case 3371 * below only happens on a VMRUN instruction intercept which has 3372 * no valid exit_int_info set. 3373 */ 3374 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { 3375 struct vmcb_control_area *nc = &nested_vmcb->control; 3376 3377 nc->exit_int_info = vmcb->control.event_inj; 3378 nc->exit_int_info_err = vmcb->control.event_inj_err; 3379 } 3380 3381 nested_vmcb->control.tlb_ctl = 0; 3382 nested_vmcb->control.event_inj = 0; 3383 nested_vmcb->control.event_inj_err = 0; 3384 3385 nested_vmcb->control.pause_filter_count = 3386 svm->vmcb->control.pause_filter_count; 3387 nested_vmcb->control.pause_filter_thresh = 3388 svm->vmcb->control.pause_filter_thresh; 3389 3390 /* We always set V_INTR_MASKING and remember the old value in hflags */ 3391 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 3392 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 3393 3394 /* Restore the original control entries */ 3395 copy_vmcb_control_area(vmcb, hsave); 3396 3397 svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; 3398 kvm_clear_exception_queue(&svm->vcpu); 3399 kvm_clear_interrupt_queue(&svm->vcpu); 3400 3401 svm->nested.nested_cr3 = 0; 3402 3403 /* Restore selected save entries */ 3404 svm->vmcb->save.es = hsave->save.es; 3405 svm->vmcb->save.cs = hsave->save.cs; 3406 svm->vmcb->save.ss = hsave->save.ss; 3407 svm->vmcb->save.ds = hsave->save.ds; 3408 svm->vmcb->save.gdtr = hsave->save.gdtr; 3409 svm->vmcb->save.idtr = hsave->save.idtr; 3410 kvm_set_rflags(&svm->vcpu, hsave->save.rflags); 3411 svm_set_efer(&svm->vcpu, hsave->save.efer); 3412 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 3413 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 3414 if (npt_enabled) { 3415 svm->vmcb->save.cr3 = hsave->save.cr3; 3416 svm->vcpu.arch.cr3 = hsave->save.cr3; 3417 } else { 3418 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 3419 } 3420 kvm_rax_write(&svm->vcpu, hsave->save.rax); 3421 kvm_rsp_write(&svm->vcpu, hsave->save.rsp); 3422 kvm_rip_write(&svm->vcpu, hsave->save.rip); 3423 svm->vmcb->save.dr7 = 0; 3424 svm->vmcb->save.cpl = 0; 3425 
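	/* Clear any stale exit_int_info in the now-restored L1 VMCB. */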
svm->vmcb->control.exit_int_info = 0; 3426 3427 mark_all_dirty(svm->vmcb); 3428 3429 kvm_vcpu_unmap(&svm->vcpu, &map, true); 3430 3431 nested_svm_uninit_mmu_context(&svm->vcpu); 3432 kvm_mmu_reset_context(&svm->vcpu); 3433 kvm_mmu_load(&svm->vcpu); 3434 3435 /* 3436 * Drop what we picked up for L2 via svm_complete_interrupts() so it 3437 * doesn't end up in L1. 3438 */ 3439 svm->vcpu.arch.nmi_injected = false; 3440 kvm_clear_exception_queue(&svm->vcpu); 3441 kvm_clear_interrupt_queue(&svm->vcpu); 3442 3443 return 0; 3444 } 3445 3446 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 3447 { 3448 /* 3449 * This function merges the msr permission bitmaps of kvm and the 3450 * nested vmcb. It is optimized in that it only merges the parts where 3451 * the kvm msr permission bitmap may contain zero bits 3452 */ 3453 int i; 3454 3455 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 3456 return true; 3457 3458 for (i = 0; i < MSRPM_OFFSETS; i++) { 3459 u32 value, p; 3460 u64 offset; 3461 3462 if (msrpm_offsets[i] == 0xffffffff) 3463 break; 3464 3465 p = msrpm_offsets[i]; 3466 offset = svm->nested.vmcb_msrpm + (p * 4); 3467 3468 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) 3469 return false; 3470 3471 svm->nested.msrpm[p] = svm->msrpm[p] | value; 3472 } 3473 3474 svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); 3475 3476 return true; 3477 } 3478 3479 static bool nested_vmcb_checks(struct vmcb *vmcb) 3480 { 3481 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) 3482 return false; 3483 3484 if (vmcb->control.asid == 0) 3485 return false; 3486 3487 if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && 3488 !npt_enabled) 3489 return false; 3490 3491 return true; 3492 } 3493 3494 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, 3495 struct vmcb *nested_vmcb, struct kvm_host_map *map) 3496 { 3497 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) 3498 svm->vcpu.arch.hflags |= HF_HIF_MASK; 3499 else 3500 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 3501 3502 if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { 3503 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; 3504 nested_svm_init_mmu_context(&svm->vcpu); 3505 } 3506 3507 /* Load the nested guest state */ 3508 svm->vmcb->save.es = nested_vmcb->save.es; 3509 svm->vmcb->save.cs = nested_vmcb->save.cs; 3510 svm->vmcb->save.ss = nested_vmcb->save.ss; 3511 svm->vmcb->save.ds = nested_vmcb->save.ds; 3512 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 3513 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 3514 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); 3515 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 3516 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 3517 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 3518 if (npt_enabled) { 3519 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 3520 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 3521 } else 3522 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 3523 3524 /* Guest paging mode is active - reset mmu */ 3525 kvm_mmu_reset_context(&svm->vcpu); 3526 3527 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 3528 kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); 3529 kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); 3530 kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); 3531 3532 /* In case we don't even reach vcpu_run, the fields are not updated */ 3533 svm->vmcb->save.rax = nested_vmcb->save.rax; 3534 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 3535 svm->vmcb->save.rip = 
nested_vmcb->save.rip; 3536 svm->vmcb->save.dr7 = nested_vmcb->save.dr7; 3537 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 3538 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 3539 3540 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; 3541 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 3542 3543 /* cache intercepts */ 3544 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; 3545 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; 3546 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 3547 svm->nested.intercept = nested_vmcb->control.intercept; 3548 3549 svm_flush_tlb(&svm->vcpu, true); 3550 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 3551 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 3552 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 3553 else 3554 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 3555 3556 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 3557 /* We only want the cr8 intercept bits of the guest */ 3558 clr_cr_intercept(svm, INTERCEPT_CR8_READ); 3559 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); 3560 } 3561 3562 /* We don't want to see VMMCALLs from a nested guest */ 3563 clr_intercept(svm, INTERCEPT_VMMCALL); 3564 3565 svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; 3566 svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; 3567 3568 svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; 3569 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 3570 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 3571 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 3572 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 3573 3574 svm->vmcb->control.pause_filter_count = 3575 nested_vmcb->control.pause_filter_count; 3576 svm->vmcb->control.pause_filter_thresh = 3577 nested_vmcb->control.pause_filter_thresh; 3578 3579 kvm_vcpu_unmap(&svm->vcpu, map, true); 3580 3581 /* Enter Guest-Mode */ 3582 enter_guest_mode(&svm->vcpu); 3583 3584 /* 3585 * Merge guest and host intercepts - must be called with vcpu in 3586 * guest-mode to take effect here 3587 */ 3588 recalc_intercepts(svm); 3589 3590 svm->nested.vmcb = vmcb_gpa; 3591 3592 enable_gif(svm); 3593 3594 mark_all_dirty(svm->vmcb); 3595 } 3596 3597 static bool nested_svm_vmrun(struct vcpu_svm *svm) 3598 { 3599 int rc; 3600 struct vmcb *nested_vmcb; 3601 struct vmcb *hsave = svm->nested.hsave; 3602 struct vmcb *vmcb = svm->vmcb; 3603 struct kvm_host_map map; 3604 u64 vmcb_gpa; 3605 3606 vmcb_gpa = svm->vmcb->save.rax; 3607 3608 rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); 3609 if (rc) { 3610 if (rc == -EINVAL) 3611 kvm_inject_gp(&svm->vcpu, 0); 3612 return false; 3613 } 3614 3615 nested_vmcb = map.hva; 3616 3617 if (!nested_vmcb_checks(nested_vmcb)) { 3618 nested_vmcb->control.exit_code = SVM_EXIT_ERR; 3619 nested_vmcb->control.exit_code_hi = 0; 3620 nested_vmcb->control.exit_info_1 = 0; 3621 nested_vmcb->control.exit_info_2 = 0; 3622 3623 kvm_vcpu_unmap(&svm->vcpu, &map, true); 3624 3625 return false; 3626 } 3627 3628 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, 3629 nested_vmcb->save.rip, 3630 nested_vmcb->control.int_ctl, 3631 nested_vmcb->control.event_inj, 3632 nested_vmcb->control.nested_ctl); 3633 3634 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, 3635 nested_vmcb->control.intercept_cr >> 16, 3636 nested_vmcb->control.intercept_exceptions, 3637 nested_vmcb->control.intercept); 3638 3639
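	/*
	 * Past this point the VMRUN is emulated for real: L1 state is
	 * stashed into hsave below and the hardware VMCB is switched over
	 * to the L2 state by enter_svm_guest_mode().
	 */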
/* Clear internal status */ 3640 kvm_clear_exception_queue(&svm->vcpu); 3641 kvm_clear_interrupt_queue(&svm->vcpu); 3642 3643 /* 3644 * Save the old vmcb, so we don't need to pick what we save, but can 3645 * restore everything when a VMEXIT occurs 3646 */ 3647 hsave->save.es = vmcb->save.es; 3648 hsave->save.cs = vmcb->save.cs; 3649 hsave->save.ss = vmcb->save.ss; 3650 hsave->save.ds = vmcb->save.ds; 3651 hsave->save.gdtr = vmcb->save.gdtr; 3652 hsave->save.idtr = vmcb->save.idtr; 3653 hsave->save.efer = svm->vcpu.arch.efer; 3654 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 3655 hsave->save.cr4 = svm->vcpu.arch.cr4; 3656 hsave->save.rflags = kvm_get_rflags(&svm->vcpu); 3657 hsave->save.rip = kvm_rip_read(&svm->vcpu); 3658 hsave->save.rsp = vmcb->save.rsp; 3659 hsave->save.rax = vmcb->save.rax; 3660 if (npt_enabled) 3661 hsave->save.cr3 = vmcb->save.cr3; 3662 else 3663 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); 3664 3665 copy_vmcb_control_area(hsave, vmcb); 3666 3667 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); 3668 3669 return true; 3670 } 3671 3672 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) 3673 { 3674 to_vmcb->save.fs = from_vmcb->save.fs; 3675 to_vmcb->save.gs = from_vmcb->save.gs; 3676 to_vmcb->save.tr = from_vmcb->save.tr; 3677 to_vmcb->save.ldtr = from_vmcb->save.ldtr; 3678 to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; 3679 to_vmcb->save.star = from_vmcb->save.star; 3680 to_vmcb->save.lstar = from_vmcb->save.lstar; 3681 to_vmcb->save.cstar = from_vmcb->save.cstar; 3682 to_vmcb->save.sfmask = from_vmcb->save.sfmask; 3683 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; 3684 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; 3685 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 3686 } 3687 3688 static int vmload_interception(struct vcpu_svm *svm) 3689 { 3690 struct vmcb *nested_vmcb; 3691 struct kvm_host_map map; 3692 int ret; 3693 3694 if (nested_svm_check_permissions(svm)) 3695 return 1; 3696 3697 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 3698 if (ret) { 3699 if (ret == -EINVAL) 3700 kvm_inject_gp(&svm->vcpu, 0); 3701 return 1; 3702 } 3703 3704 nested_vmcb = map.hva; 3705 3706 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3707 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3708 3709 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 3710 kvm_vcpu_unmap(&svm->vcpu, &map, true); 3711 3712 return ret; 3713 } 3714 3715 static int vmsave_interception(struct vcpu_svm *svm) 3716 { 3717 struct vmcb *nested_vmcb; 3718 struct kvm_host_map map; 3719 int ret; 3720 3721 if (nested_svm_check_permissions(svm)) 3722 return 1; 3723 3724 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 3725 if (ret) { 3726 if (ret == -EINVAL) 3727 kvm_inject_gp(&svm->vcpu, 0); 3728 return 1; 3729 } 3730 3731 nested_vmcb = map.hva; 3732 3733 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3734 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3735 3736 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 3737 kvm_vcpu_unmap(&svm->vcpu, &map, true); 3738 3739 return ret; 3740 } 3741 3742 static int vmrun_interception(struct vcpu_svm *svm) 3743 { 3744 if (nested_svm_check_permissions(svm)) 3745 return 1; 3746 3747 /* Save rip after vmrun instruction */ 3748 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); 3749 3750 if (!nested_svm_vmrun(svm)) 3751 return 1; 3752 3753 if (!nested_svm_vmrun_msrpm(svm)) 3754 goto failed; 3755 3756 return 1; 3757 3758 failed: 3759 3760 
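	/*
	 * Merging the nested MSR permission bitmap failed; report an error
	 * #VMEXIT (SVM_EXIT_ERR) back to the nested hypervisor.
	 */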
svm->vmcb->control.exit_code = SVM_EXIT_ERR; 3761 svm->vmcb->control.exit_code_hi = 0; 3762 svm->vmcb->control.exit_info_1 = 0; 3763 svm->vmcb->control.exit_info_2 = 0; 3764 3765 nested_svm_vmexit(svm); 3766 3767 return 1; 3768 } 3769 3770 static int stgi_interception(struct vcpu_svm *svm) 3771 { 3772 int ret; 3773 3774 if (nested_svm_check_permissions(svm)) 3775 return 1; 3776 3777 /* 3778 * If VGIF is enabled, the STGI intercept is only added to 3779 * detect the opening of the SMI/NMI window; remove it now. 3780 */ 3781 if (vgif_enabled(svm)) 3782 clr_intercept(svm, INTERCEPT_STGI); 3783 3784 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3785 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3786 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3787 3788 enable_gif(svm); 3789 3790 return ret; 3791 } 3792 3793 static int clgi_interception(struct vcpu_svm *svm) 3794 { 3795 int ret; 3796 3797 if (nested_svm_check_permissions(svm)) 3798 return 1; 3799 3800 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3801 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3802 3803 disable_gif(svm); 3804 3805 /* After a CLGI no interrupts should come */ 3806 if (!kvm_vcpu_apicv_active(&svm->vcpu)) { 3807 svm_clear_vintr(svm); 3808 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3809 mark_dirty(svm->vmcb, VMCB_INTR); 3810 } 3811 3812 return ret; 3813 } 3814 3815 static int invlpga_interception(struct vcpu_svm *svm) 3816 { 3817 struct kvm_vcpu *vcpu = &svm->vcpu; 3818 3819 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu), 3820 kvm_rax_read(&svm->vcpu)); 3821 3822 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 3823 kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); 3824 3825 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3826 return kvm_skip_emulated_instruction(&svm->vcpu); 3827 } 3828 3829 static int skinit_interception(struct vcpu_svm *svm) 3830 { 3831 trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu)); 3832 3833 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 3834 return 1; 3835 } 3836 3837 static int wbinvd_interception(struct vcpu_svm *svm) 3838 { 3839 return kvm_emulate_wbinvd(&svm->vcpu); 3840 } 3841 3842 static int xsetbv_interception(struct vcpu_svm *svm) 3843 { 3844 u64 new_bv = kvm_read_edx_eax(&svm->vcpu); 3845 u32 index = kvm_rcx_read(&svm->vcpu); 3846 3847 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { 3848 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3849 return kvm_skip_emulated_instruction(&svm->vcpu); 3850 } 3851 3852 return 1; 3853 } 3854 3855 static int task_switch_interception(struct vcpu_svm *svm) 3856 { 3857 u16 tss_selector; 3858 int reason; 3859 int int_type = svm->vmcb->control.exit_int_info & 3860 SVM_EXITINTINFO_TYPE_MASK; 3861 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 3862 uint32_t type = 3863 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 3864 uint32_t idt_v = 3865 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 3866 bool has_error_code = false; 3867 u32 error_code = 0; 3868 3869 tss_selector = (u16)svm->vmcb->control.exit_info_1; 3870 3871 if (svm->vmcb->control.exit_info_2 & 3872 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 3873 reason = TASK_SWITCH_IRET; 3874 else if (svm->vmcb->control.exit_info_2 & 3875 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 3876 reason = TASK_SWITCH_JMP; 3877 else if (idt_v) 3878 reason = TASK_SWITCH_GATE; 3879 else 3880 reason = TASK_SWITCH_CALL; 3881 3882 if (reason == TASK_SWITCH_GATE) { 3883 switch (type) { 3884 case SVM_EXITINTINFO_TYPE_NMI: 3885 
svm->vcpu.arch.nmi_injected = false; 3886 break; 3887 case SVM_EXITINTINFO_TYPE_EXEPT: 3888 if (svm->vmcb->control.exit_info_2 & 3889 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 3890 has_error_code = true; 3891 error_code = 3892 (u32)svm->vmcb->control.exit_info_2; 3893 } 3894 kvm_clear_exception_queue(&svm->vcpu); 3895 break; 3896 case SVM_EXITINTINFO_TYPE_INTR: 3897 kvm_clear_interrupt_queue(&svm->vcpu); 3898 break; 3899 default: 3900 break; 3901 } 3902 } 3903 3904 if (reason != TASK_SWITCH_GATE || 3905 int_type == SVM_EXITINTINFO_TYPE_SOFT || 3906 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 3907 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 3908 skip_emulated_instruction(&svm->vcpu); 3909 3910 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 3911 int_vec = -1; 3912 3913 if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, 3914 has_error_code, error_code) == EMULATE_FAIL) { 3915 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3916 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3917 svm->vcpu.run->internal.ndata = 0; 3918 return 0; 3919 } 3920 return 1; 3921 } 3922 3923 static int cpuid_interception(struct vcpu_svm *svm) 3924 { 3925 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3926 return kvm_emulate_cpuid(&svm->vcpu); 3927 } 3928 3929 static int iret_interception(struct vcpu_svm *svm) 3930 { 3931 ++svm->vcpu.stat.nmi_window_exits; 3932 clr_intercept(svm, INTERCEPT_IRET); 3933 svm->vcpu.arch.hflags |= HF_IRET_MASK; 3934 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); 3935 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3936 return 1; 3937 } 3938 3939 static int invlpg_interception(struct vcpu_svm *svm) 3940 { 3941 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 3942 return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 3943 3944 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); 3945 return kvm_skip_emulated_instruction(&svm->vcpu); 3946 } 3947 3948 static int emulate_on_interception(struct vcpu_svm *svm) 3949 { 3950 return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 3951 } 3952 3953 static int rsm_interception(struct vcpu_svm *svm) 3954 { 3955 return kvm_emulate_instruction_from_buffer(&svm->vcpu, 3956 rsm_ins_bytes, 2) == EMULATE_DONE; 3957 } 3958 3959 static int rdpmc_interception(struct vcpu_svm *svm) 3960 { 3961 int err; 3962 3963 if (!nrips) 3964 return emulate_on_interception(svm); 3965 3966 err = kvm_rdpmc(&svm->vcpu); 3967 return kvm_complete_insn_gp(&svm->vcpu, err); 3968 } 3969 3970 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, 3971 unsigned long val) 3972 { 3973 unsigned long cr0 = svm->vcpu.arch.cr0; 3974 bool ret = false; 3975 u64 intercept; 3976 3977 intercept = svm->nested.intercept; 3978 3979 if (!is_guest_mode(&svm->vcpu) || 3980 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) 3981 return false; 3982 3983 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 3984 val &= ~SVM_CR0_SELECTIVE_MASK; 3985 3986 if (cr0 ^ val) { 3987 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 3988 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 3989 } 3990 3991 return ret; 3992 } 3993 3994 #define CR_VALID (1ULL << 63) 3995 3996 static int cr_interception(struct vcpu_svm *svm) 3997 { 3998 int reg, cr; 3999 unsigned long val; 4000 int err; 4001 4002 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 4003 return emulate_on_interception(svm); 4004 4005 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 4006 return emulate_on_interception(svm); 4007 4008 reg = svm->vmcb->control.exit_info_1 & 
SVM_EXITINFO_REG_MASK; 4009 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 4010 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 4011 else 4012 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 4013 4014 err = 0; 4015 if (cr >= 16) { /* mov to cr */ 4016 cr -= 16; 4017 val = kvm_register_read(&svm->vcpu, reg); 4018 switch (cr) { 4019 case 0: 4020 if (!check_selective_cr0_intercepted(svm, val)) 4021 err = kvm_set_cr0(&svm->vcpu, val); 4022 else 4023 return 1; 4024 4025 break; 4026 case 3: 4027 err = kvm_set_cr3(&svm->vcpu, val); 4028 break; 4029 case 4: 4030 err = kvm_set_cr4(&svm->vcpu, val); 4031 break; 4032 case 8: 4033 err = kvm_set_cr8(&svm->vcpu, val); 4034 break; 4035 default: 4036 WARN(1, "unhandled write to CR%d", cr); 4037 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 4038 return 1; 4039 } 4040 } else { /* mov from cr */ 4041 switch (cr) { 4042 case 0: 4043 val = kvm_read_cr0(&svm->vcpu); 4044 break; 4045 case 2: 4046 val = svm->vcpu.arch.cr2; 4047 break; 4048 case 3: 4049 val = kvm_read_cr3(&svm->vcpu); 4050 break; 4051 case 4: 4052 val = kvm_read_cr4(&svm->vcpu); 4053 break; 4054 case 8: 4055 val = kvm_get_cr8(&svm->vcpu); 4056 break; 4057 default: 4058 WARN(1, "unhandled read from CR%d", cr); 4059 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 4060 return 1; 4061 } 4062 kvm_register_write(&svm->vcpu, reg, val); 4063 } 4064 return kvm_complete_insn_gp(&svm->vcpu, err); 4065 } 4066 4067 static int dr_interception(struct vcpu_svm *svm) 4068 { 4069 int reg, dr; 4070 unsigned long val; 4071 4072 if (svm->vcpu.guest_debug == 0) { 4073 /* 4074 * No more DR vmexits; force a reload of the debug registers 4075 * and reenter on this instruction. The next vmexit will 4076 * retrieve the full state of the debug registers. 4077 */ 4078 clr_dr_intercepts(svm); 4079 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 4080 return 1; 4081 } 4082 4083 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 4084 return emulate_on_interception(svm); 4085 4086 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 4087 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 4088 4089 if (dr >= 16) { /* mov to DRn */ 4090 if (!kvm_require_dr(&svm->vcpu, dr - 16)) 4091 return 1; 4092 val = kvm_register_read(&svm->vcpu, reg); 4093 kvm_set_dr(&svm->vcpu, dr - 16, val); 4094 } else { 4095 if (!kvm_require_dr(&svm->vcpu, dr)) 4096 return 1; 4097 kvm_get_dr(&svm->vcpu, dr, &val); 4098 kvm_register_write(&svm->vcpu, reg, val); 4099 } 4100 4101 return kvm_skip_emulated_instruction(&svm->vcpu); 4102 } 4103 4104 static int cr8_write_interception(struct vcpu_svm *svm) 4105 { 4106 struct kvm_run *kvm_run = svm->vcpu.run; 4107 int r; 4108 4109 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 4110 /* instruction emulation calls kvm_set_cr8() */ 4111 r = cr_interception(svm); 4112 if (lapic_in_kernel(&svm->vcpu)) 4113 return r; 4114 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 4115 return r; 4116 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 4117 return 0; 4118 } 4119 4120 static int svm_get_msr_feature(struct kvm_msr_entry *msr) 4121 { 4122 msr->data = 0; 4123 4124 switch (msr->index) { 4125 case MSR_F10H_DECFG: 4126 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) 4127 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; 4128 break; 4129 default: 4130 return 1; 4131 } 4132 4133 return 0; 4134 } 4135 4136 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 4137 { 4138 struct vcpu_svm *svm = to_svm(vcpu); 4139 4140 switch (msr_info->index) { 4141 case MSR_STAR: 4142 msr_info->data = svm->vmcb->save.star; 4143 
break; 4144 #ifdef CONFIG_X86_64 4145 case MSR_LSTAR: 4146 msr_info->data = svm->vmcb->save.lstar; 4147 break; 4148 case MSR_CSTAR: 4149 msr_info->data = svm->vmcb->save.cstar; 4150 break; 4151 case MSR_KERNEL_GS_BASE: 4152 msr_info->data = svm->vmcb->save.kernel_gs_base; 4153 break; 4154 case MSR_SYSCALL_MASK: 4155 msr_info->data = svm->vmcb->save.sfmask; 4156 break; 4157 #endif 4158 case MSR_IA32_SYSENTER_CS: 4159 msr_info->data = svm->vmcb->save.sysenter_cs; 4160 break; 4161 case MSR_IA32_SYSENTER_EIP: 4162 msr_info->data = svm->sysenter_eip; 4163 break; 4164 case MSR_IA32_SYSENTER_ESP: 4165 msr_info->data = svm->sysenter_esp; 4166 break; 4167 case MSR_TSC_AUX: 4168 if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 4169 return 1; 4170 msr_info->data = svm->tsc_aux; 4171 break; 4172 /* 4173 * Nobody will change the following 5 values in the VMCB so we can 4174 * safely return them on rdmsr. They will always be 0 until LBRV is 4175 * implemented. 4176 */ 4177 case MSR_IA32_DEBUGCTLMSR: 4178 msr_info->data = svm->vmcb->save.dbgctl; 4179 break; 4180 case MSR_IA32_LASTBRANCHFROMIP: 4181 msr_info->data = svm->vmcb->save.br_from; 4182 break; 4183 case MSR_IA32_LASTBRANCHTOIP: 4184 msr_info->data = svm->vmcb->save.br_to; 4185 break; 4186 case MSR_IA32_LASTINTFROMIP: 4187 msr_info->data = svm->vmcb->save.last_excp_from; 4188 break; 4189 case MSR_IA32_LASTINTTOIP: 4190 msr_info->data = svm->vmcb->save.last_excp_to; 4191 break; 4192 case MSR_VM_HSAVE_PA: 4193 msr_info->data = svm->nested.hsave_msr; 4194 break; 4195 case MSR_VM_CR: 4196 msr_info->data = svm->nested.vm_cr_msr; 4197 break; 4198 case MSR_IA32_SPEC_CTRL: 4199 if (!msr_info->host_initiated && 4200 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && 4201 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 4202 return 1; 4203 4204 msr_info->data = svm->spec_ctrl; 4205 break; 4206 case MSR_AMD64_VIRT_SPEC_CTRL: 4207 if (!msr_info->host_initiated && 4208 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 4209 return 1; 4210 4211 msr_info->data = svm->virt_spec_ctrl; 4212 break; 4213 case MSR_F15H_IC_CFG: { 4214 4215 int family, model; 4216 4217 family = guest_cpuid_family(vcpu); 4218 model = guest_cpuid_model(vcpu); 4219 4220 if (family < 0 || model < 0) 4221 return kvm_get_msr_common(vcpu, msr_info); 4222 4223 msr_info->data = 0; 4224 4225 if (family == 0x15 && 4226 (model >= 0x2 && model < 0x20)) 4227 msr_info->data = 0x1E; 4228 } 4229 break; 4230 case MSR_F10H_DECFG: 4231 msr_info->data = svm->msr_decfg; 4232 break; 4233 default: 4234 return kvm_get_msr_common(vcpu, msr_info); 4235 } 4236 return 0; 4237 } 4238 4239 static int rdmsr_interception(struct vcpu_svm *svm) 4240 { 4241 u32 ecx = kvm_rcx_read(&svm->vcpu); 4242 struct msr_data msr_info; 4243 4244 msr_info.index = ecx; 4245 msr_info.host_initiated = false; 4246 if (svm_get_msr(&svm->vcpu, &msr_info)) { 4247 trace_kvm_msr_read_ex(ecx); 4248 kvm_inject_gp(&svm->vcpu, 0); 4249 return 1; 4250 } else { 4251 trace_kvm_msr_read(ecx, msr_info.data); 4252 4253 kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); 4254 kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); 4255 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4256 return kvm_skip_emulated_instruction(&svm->vcpu); 4257 } 4258 } 4259 4260 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 4261 { 4262 struct vcpu_svm *svm = to_svm(vcpu); 4263 int svm_dis, chg_mask; 4264 4265 if (data & ~SVM_VM_CR_VALID_MASK) 4266 return 1; 4267 4268 chg_mask = SVM_VM_CR_VALID_MASK; 4269 4270 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 4271 chg_mask &= 
~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 4272 4273 svm->nested.vm_cr_msr &= ~chg_mask; 4274 svm->nested.vm_cr_msr |= (data & chg_mask); 4275 4276 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 4277 4278 /* check for svm_disable while efer.svme is set */ 4279 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 4280 return 1; 4281 4282 return 0; 4283 } 4284 4285 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 4286 { 4287 struct vcpu_svm *svm = to_svm(vcpu); 4288 4289 u32 ecx = msr->index; 4290 u64 data = msr->data; 4291 switch (ecx) { 4292 case MSR_IA32_CR_PAT: 4293 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 4294 return 1; 4295 vcpu->arch.pat = data; 4296 svm->vmcb->save.g_pat = data; 4297 mark_dirty(svm->vmcb, VMCB_NPT); 4298 break; 4299 case MSR_IA32_SPEC_CTRL: 4300 if (!msr->host_initiated && 4301 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && 4302 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 4303 return 1; 4304 4305 /* The STIBP bit doesn't fault even if it's not advertised */ 4306 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) 4307 return 1; 4308 4309 svm->spec_ctrl = data; 4310 4311 if (!data) 4312 break; 4313 4314 /* 4315 * For non-nested: 4316 * When it's written (to non-zero) for the first time, pass 4317 * it through. 4318 * 4319 * For nested: 4320 * The handling of the MSR bitmap for L2 guests is done in 4321 * nested_svm_vmrun_msrpm. 4322 * We update the L1 MSR bit as well since it will end up 4323 * touching the MSR anyway now. 4324 */ 4325 set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 4326 break; 4327 case MSR_IA32_PRED_CMD: 4328 if (!msr->host_initiated && 4329 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) 4330 return 1; 4331 4332 if (data & ~PRED_CMD_IBPB) 4333 return 1; 4334 4335 if (!data) 4336 break; 4337 4338 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 4339 if (is_guest_mode(vcpu)) 4340 break; 4341 set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 4342 break; 4343 case MSR_AMD64_VIRT_SPEC_CTRL: 4344 if (!msr->host_initiated && 4345 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 4346 return 1; 4347 4348 if (data & ~SPEC_CTRL_SSBD) 4349 return 1; 4350 4351 svm->virt_spec_ctrl = data; 4352 break; 4353 case MSR_STAR: 4354 svm->vmcb->save.star = data; 4355 break; 4356 #ifdef CONFIG_X86_64 4357 case MSR_LSTAR: 4358 svm->vmcb->save.lstar = data; 4359 break; 4360 case MSR_CSTAR: 4361 svm->vmcb->save.cstar = data; 4362 break; 4363 case MSR_KERNEL_GS_BASE: 4364 svm->vmcb->save.kernel_gs_base = data; 4365 break; 4366 case MSR_SYSCALL_MASK: 4367 svm->vmcb->save.sfmask = data; 4368 break; 4369 #endif 4370 case MSR_IA32_SYSENTER_CS: 4371 svm->vmcb->save.sysenter_cs = data; 4372 break; 4373 case MSR_IA32_SYSENTER_EIP: 4374 svm->sysenter_eip = data; 4375 svm->vmcb->save.sysenter_eip = data; 4376 break; 4377 case MSR_IA32_SYSENTER_ESP: 4378 svm->sysenter_esp = data; 4379 svm->vmcb->save.sysenter_esp = data; 4380 break; 4381 case MSR_TSC_AUX: 4382 if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 4383 return 1; 4384 4385 /* 4386 * This is rare, so we update the MSR here instead of using 4387 * direct_access_msrs. Doing that would require a rdmsr in 4388 * svm_vcpu_put. 
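 * (MSR_TSC_AUX is in host_save_user_msrs, so the host value is saved
 * on vcpu load and restored on vcpu put.)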
4389 */ 4390 svm->tsc_aux = data; 4391 wrmsrl(MSR_TSC_AUX, svm->tsc_aux); 4392 break; 4393 case MSR_IA32_DEBUGCTLMSR: 4394 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 4395 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 4396 __func__, data); 4397 break; 4398 } 4399 if (data & DEBUGCTL_RESERVED_BITS) 4400 return 1; 4401 4402 svm->vmcb->save.dbgctl = data; 4403 mark_dirty(svm->vmcb, VMCB_LBR); 4404 if (data & (1ULL<<0)) 4405 svm_enable_lbrv(svm); 4406 else 4407 svm_disable_lbrv(svm); 4408 break; 4409 case MSR_VM_HSAVE_PA: 4410 svm->nested.hsave_msr = data; 4411 break; 4412 case MSR_VM_CR: 4413 return svm_set_vm_cr(vcpu, data); 4414 case MSR_VM_IGNNE: 4415 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 4416 break; 4417 case MSR_F10H_DECFG: { 4418 struct kvm_msr_entry msr_entry; 4419 4420 msr_entry.index = msr->index; 4421 if (svm_get_msr_feature(&msr_entry)) 4422 return 1; 4423 4424 /* Check the supported bits */ 4425 if (data & ~msr_entry.data) 4426 return 1; 4427 4428 /* Don't allow the guest to change a bit, #GP */ 4429 if (!msr->host_initiated && (data ^ msr_entry.data)) 4430 return 1; 4431 4432 svm->msr_decfg = data; 4433 break; 4434 } 4435 case MSR_IA32_APICBASE: 4436 if (kvm_vcpu_apicv_active(vcpu)) 4437 avic_update_vapic_bar(to_svm(vcpu), data); 4438 /* Fall through */ 4439 default: 4440 return kvm_set_msr_common(vcpu, msr); 4441 } 4442 return 0; 4443 } 4444 4445 static int wrmsr_interception(struct vcpu_svm *svm) 4446 { 4447 struct msr_data msr; 4448 u32 ecx = kvm_rcx_read(&svm->vcpu); 4449 u64 data = kvm_read_edx_eax(&svm->vcpu); 4450 4451 msr.data = data; 4452 msr.index = ecx; 4453 msr.host_initiated = false; 4454 4455 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4456 if (kvm_set_msr(&svm->vcpu, &msr)) { 4457 trace_kvm_msr_write_ex(ecx, data); 4458 kvm_inject_gp(&svm->vcpu, 0); 4459 return 1; 4460 } else { 4461 trace_kvm_msr_write(ecx, data); 4462 return kvm_skip_emulated_instruction(&svm->vcpu); 4463 } 4464 } 4465 4466 static int msr_interception(struct vcpu_svm *svm) 4467 { 4468 if (svm->vmcb->control.exit_info_1) 4469 return wrmsr_interception(svm); 4470 else 4471 return rdmsr_interception(svm); 4472 } 4473 4474 static int interrupt_window_interception(struct vcpu_svm *svm) 4475 { 4476 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 4477 svm_clear_vintr(svm); 4478 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 4479 mark_dirty(svm->vmcb, VMCB_INTR); 4480 ++svm->vcpu.stat.irq_window_exits; 4481 return 1; 4482 } 4483 4484 static int pause_interception(struct vcpu_svm *svm) 4485 { 4486 struct kvm_vcpu *vcpu = &svm->vcpu; 4487 bool in_kernel = (svm_get_cpl(vcpu) == 0); 4488 4489 if (pause_filter_thresh) 4490 grow_ple_window(vcpu); 4491 4492 kvm_vcpu_on_spin(vcpu, in_kernel); 4493 return 1; 4494 } 4495 4496 static int nop_interception(struct vcpu_svm *svm) 4497 { 4498 return kvm_skip_emulated_instruction(&(svm->vcpu)); 4499 } 4500 4501 static int monitor_interception(struct vcpu_svm *svm) 4502 { 4503 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 4504 return nop_interception(svm); 4505 } 4506 4507 static int mwait_interception(struct vcpu_svm *svm) 4508 { 4509 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 4510 return nop_interception(svm); 4511 } 4512 4513 enum avic_ipi_failure_cause { 4514 AVIC_IPI_FAILURE_INVALID_INT_TYPE, 4515 AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, 4516 AVIC_IPI_FAILURE_INVALID_TARGET, 4517 AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, 4518 }; 4519 4520 static int 
avic_incomplete_ipi_interception(struct vcpu_svm *svm) 4521 { 4522 u32 icrh = svm->vmcb->control.exit_info_1 >> 32; 4523 u32 icrl = svm->vmcb->control.exit_info_1; 4524 u32 id = svm->vmcb->control.exit_info_2 >> 32; 4525 u32 index = svm->vmcb->control.exit_info_2 & 0xFF; 4526 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4527 4528 trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); 4529 4530 switch (id) { 4531 case AVIC_IPI_FAILURE_INVALID_INT_TYPE: 4532 /* 4533 * AVIC hardware handles the generation of 4534 * IPIs when the specified Message Type is Fixed 4535 * (also known as fixed delivery mode) and 4536 * the Trigger Mode is edge-triggered. The hardware 4537 * also supports self and broadcast delivery modes 4538 * specified via the Destination Shorthand(DSH) 4539 * field of the ICRL. Logical and physical APIC ID 4540 * formats are supported. All other IPI types cause 4541 * a #VMEXIT, which needs to emulated. 4542 */ 4543 kvm_lapic_reg_write(apic, APIC_ICR2, icrh); 4544 kvm_lapic_reg_write(apic, APIC_ICR, icrl); 4545 break; 4546 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { 4547 int i; 4548 struct kvm_vcpu *vcpu; 4549 struct kvm *kvm = svm->vcpu.kvm; 4550 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4551 4552 /* 4553 * At this point, we expect that the AVIC HW has already 4554 * set the appropriate IRR bits on the valid target 4555 * vcpus. So, we just need to kick the appropriate vcpu. 4556 */ 4557 kvm_for_each_vcpu(i, vcpu, kvm) { 4558 bool m = kvm_apic_match_dest(vcpu, apic, 4559 icrl & KVM_APIC_SHORT_MASK, 4560 GET_APIC_DEST_FIELD(icrh), 4561 icrl & KVM_APIC_DEST_MASK); 4562 4563 if (m && !avic_vcpu_is_running(vcpu)) 4564 kvm_vcpu_wake_up(vcpu); 4565 } 4566 break; 4567 } 4568 case AVIC_IPI_FAILURE_INVALID_TARGET: 4569 WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", 4570 index, svm->vcpu.vcpu_id, icrh, icrl); 4571 break; 4572 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: 4573 WARN_ONCE(1, "Invalid backing page\n"); 4574 break; 4575 default: 4576 pr_err("Unknown IPI interception\n"); 4577 } 4578 4579 return 1; 4580 } 4581 4582 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 4583 { 4584 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 4585 int index; 4586 u32 *logical_apic_id_table; 4587 int dlid = GET_APIC_LOGICAL_ID(ldr); 4588 4589 if (!dlid) 4590 return NULL; 4591 4592 if (flat) { /* flat */ 4593 index = ffs(dlid) - 1; 4594 if (index > 7) 4595 return NULL; 4596 } else { /* cluster */ 4597 int cluster = (dlid & 0xf0) >> 4; 4598 int apic = ffs(dlid & 0x0f) - 1; 4599 4600 if ((apic < 0) || (apic > 7) || 4601 (cluster >= 0xf)) 4602 return NULL; 4603 index = (cluster << 2) + apic; 4604 } 4605 4606 logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); 4607 4608 return &logical_apic_id_table[index]; 4609 } 4610 4611 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) 4612 { 4613 bool flat; 4614 u32 *entry, new_entry; 4615 4616 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; 4617 entry = avic_get_logical_id_entry(vcpu, ldr, flat); 4618 if (!entry) 4619 return -EINVAL; 4620 4621 new_entry = READ_ONCE(*entry); 4622 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 4623 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); 4624 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; 4625 WRITE_ONCE(*entry, new_entry); 4626 4627 return 0; 4628 } 4629 4630 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) 
4631 { 4632 struct vcpu_svm *svm = to_svm(vcpu); 4633 bool flat = svm->dfr_reg == APIC_DFR_FLAT; 4634 u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); 4635 4636 if (entry) 4637 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); 4638 } 4639 4640 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) 4641 { 4642 int ret = 0; 4643 struct vcpu_svm *svm = to_svm(vcpu); 4644 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); 4645 4646 if (ldr == svm->ldr_reg) 4647 return 0; 4648 4649 avic_invalidate_logical_id_entry(vcpu); 4650 4651 if (ldr) 4652 ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); 4653 4654 if (!ret) 4655 svm->ldr_reg = ldr; 4656 4657 return ret; 4658 } 4659 4660 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) 4661 { 4662 u64 *old, *new; 4663 struct vcpu_svm *svm = to_svm(vcpu); 4664 u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); 4665 u32 id = (apic_id_reg >> 24) & 0xff; 4666 4667 if (vcpu->vcpu_id == id) 4668 return 0; 4669 4670 old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); 4671 new = avic_get_physical_id_entry(vcpu, id); 4672 if (!new || !old) 4673 return 1; 4674 4675 /* We need to move physical_id_entry to new offset */ 4676 *new = *old; 4677 *old = 0ULL; 4678 to_svm(vcpu)->avic_physical_id_cache = new; 4679 4680 /* 4681 * Also update the guest physical APIC ID in the logical 4682 * APIC ID table entry if already setup the LDR. 4683 */ 4684 if (svm->ldr_reg) 4685 avic_handle_ldr_update(vcpu); 4686 4687 return 0; 4688 } 4689 4690 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) 4691 { 4692 struct vcpu_svm *svm = to_svm(vcpu); 4693 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 4694 4695 if (svm->dfr_reg == dfr) 4696 return; 4697 4698 avic_invalidate_logical_id_entry(vcpu); 4699 svm->dfr_reg = dfr; 4700 } 4701 4702 static int avic_unaccel_trap_write(struct vcpu_svm *svm) 4703 { 4704 struct kvm_lapic *apic = svm->vcpu.arch.apic; 4705 u32 offset = svm->vmcb->control.exit_info_1 & 4706 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 4707 4708 switch (offset) { 4709 case APIC_ID: 4710 if (avic_handle_apic_id_update(&svm->vcpu)) 4711 return 0; 4712 break; 4713 case APIC_LDR: 4714 if (avic_handle_ldr_update(&svm->vcpu)) 4715 return 0; 4716 break; 4717 case APIC_DFR: 4718 avic_handle_dfr_update(&svm->vcpu); 4719 break; 4720 default: 4721 break; 4722 } 4723 4724 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); 4725 4726 return 1; 4727 } 4728 4729 static bool is_avic_unaccelerated_access_trap(u32 offset) 4730 { 4731 bool ret = false; 4732 4733 switch (offset) { 4734 case APIC_ID: 4735 case APIC_EOI: 4736 case APIC_RRR: 4737 case APIC_LDR: 4738 case APIC_DFR: 4739 case APIC_SPIV: 4740 case APIC_ESR: 4741 case APIC_ICR: 4742 case APIC_LVTT: 4743 case APIC_LVTTHMR: 4744 case APIC_LVTPC: 4745 case APIC_LVT0: 4746 case APIC_LVT1: 4747 case APIC_LVTERR: 4748 case APIC_TMICT: 4749 case APIC_TDCR: 4750 ret = true; 4751 break; 4752 default: 4753 break; 4754 } 4755 return ret; 4756 } 4757 4758 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) 4759 { 4760 int ret = 0; 4761 u32 offset = svm->vmcb->control.exit_info_1 & 4762 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 4763 u32 vector = svm->vmcb->control.exit_info_2 & 4764 AVIC_UNACCEL_ACCESS_VECTOR_MASK; 4765 bool write = (svm->vmcb->control.exit_info_1 >> 32) & 4766 AVIC_UNACCEL_ACCESS_WRITE_MASK; 4767 bool trap = is_avic_unaccelerated_access_trap(offset); 4768 4769 trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, 4770 trap, 
write, vector); 4771 if (trap) { 4772 /* Handling Trap */ 4773 WARN_ONCE(!write, "svm: Handling trap read.\n"); 4774 ret = avic_unaccel_trap_write(svm); 4775 } else { 4776 /* Handling Fault */ 4777 ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); 4778 } 4779 4780 return ret; 4781 } 4782 4783 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { 4784 [SVM_EXIT_READ_CR0] = cr_interception, 4785 [SVM_EXIT_READ_CR3] = cr_interception, 4786 [SVM_EXIT_READ_CR4] = cr_interception, 4787 [SVM_EXIT_READ_CR8] = cr_interception, 4788 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 4789 [SVM_EXIT_WRITE_CR0] = cr_interception, 4790 [SVM_EXIT_WRITE_CR3] = cr_interception, 4791 [SVM_EXIT_WRITE_CR4] = cr_interception, 4792 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 4793 [SVM_EXIT_READ_DR0] = dr_interception, 4794 [SVM_EXIT_READ_DR1] = dr_interception, 4795 [SVM_EXIT_READ_DR2] = dr_interception, 4796 [SVM_EXIT_READ_DR3] = dr_interception, 4797 [SVM_EXIT_READ_DR4] = dr_interception, 4798 [SVM_EXIT_READ_DR5] = dr_interception, 4799 [SVM_EXIT_READ_DR6] = dr_interception, 4800 [SVM_EXIT_READ_DR7] = dr_interception, 4801 [SVM_EXIT_WRITE_DR0] = dr_interception, 4802 [SVM_EXIT_WRITE_DR1] = dr_interception, 4803 [SVM_EXIT_WRITE_DR2] = dr_interception, 4804 [SVM_EXIT_WRITE_DR3] = dr_interception, 4805 [SVM_EXIT_WRITE_DR4] = dr_interception, 4806 [SVM_EXIT_WRITE_DR5] = dr_interception, 4807 [SVM_EXIT_WRITE_DR6] = dr_interception, 4808 [SVM_EXIT_WRITE_DR7] = dr_interception, 4809 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 4810 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 4811 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 4812 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 4813 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 4814 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 4815 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 4816 [SVM_EXIT_INTR] = intr_interception, 4817 [SVM_EXIT_NMI] = nmi_interception, 4818 [SVM_EXIT_SMI] = nop_on_interception, 4819 [SVM_EXIT_INIT] = nop_on_interception, 4820 [SVM_EXIT_VINTR] = interrupt_window_interception, 4821 [SVM_EXIT_RDPMC] = rdpmc_interception, 4822 [SVM_EXIT_CPUID] = cpuid_interception, 4823 [SVM_EXIT_IRET] = iret_interception, 4824 [SVM_EXIT_INVD] = emulate_on_interception, 4825 [SVM_EXIT_PAUSE] = pause_interception, 4826 [SVM_EXIT_HLT] = halt_interception, 4827 [SVM_EXIT_INVLPG] = invlpg_interception, 4828 [SVM_EXIT_INVLPGA] = invlpga_interception, 4829 [SVM_EXIT_IOIO] = io_interception, 4830 [SVM_EXIT_MSR] = msr_interception, 4831 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 4832 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 4833 [SVM_EXIT_VMRUN] = vmrun_interception, 4834 [SVM_EXIT_VMMCALL] = vmmcall_interception, 4835 [SVM_EXIT_VMLOAD] = vmload_interception, 4836 [SVM_EXIT_VMSAVE] = vmsave_interception, 4837 [SVM_EXIT_STGI] = stgi_interception, 4838 [SVM_EXIT_CLGI] = clgi_interception, 4839 [SVM_EXIT_SKINIT] = skinit_interception, 4840 [SVM_EXIT_WBINVD] = wbinvd_interception, 4841 [SVM_EXIT_MONITOR] = monitor_interception, 4842 [SVM_EXIT_MWAIT] = mwait_interception, 4843 [SVM_EXIT_XSETBV] = xsetbv_interception, 4844 [SVM_EXIT_NPF] = npf_interception, 4845 [SVM_EXIT_RSM] = rsm_interception, 4846 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 4847 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 4848 }; 4849 4850 static void dump_vmcb(struct kvm_vcpu *vcpu) 4851 { 4852 struct vcpu_svm *svm = to_svm(vcpu); 4853 struct vmcb_control_area *control = 
&svm->vmcb->control; 4854 struct vmcb_save_area *save = &svm->vmcb->save; 4855 4856 if (!dump_invalid_vmcb) { 4857 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 4858 return; 4859 } 4860 4861 pr_err("VMCB Control Area:\n"); 4862 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); 4863 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); 4864 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); 4865 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); 4866 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); 4867 pr_err("%-20s%016llx\n", "intercepts:", control->intercept); 4868 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 4869 pr_err("%-20s%d\n", "pause filter threshold:", 4870 control->pause_filter_thresh); 4871 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 4872 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 4873 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 4874 pr_err("%-20s%d\n", "asid:", control->asid); 4875 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 4876 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 4877 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 4878 pr_err("%-20s%08x\n", "int_state:", control->int_state); 4879 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 4880 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 4881 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 4882 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 4883 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 4884 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 4885 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 4886 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 4887 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 4888 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 4889 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 4890 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 4891 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 4892 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 4893 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 4894 pr_err("VMCB State Save Area:\n"); 4895 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4896 "es:", 4897 save->es.selector, save->es.attrib, 4898 save->es.limit, save->es.base); 4899 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4900 "cs:", 4901 save->cs.selector, save->cs.attrib, 4902 save->cs.limit, save->cs.base); 4903 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4904 "ss:", 4905 save->ss.selector, save->ss.attrib, 4906 save->ss.limit, save->ss.base); 4907 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4908 "ds:", 4909 save->ds.selector, save->ds.attrib, 4910 save->ds.limit, save->ds.base); 4911 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4912 "fs:", 4913 save->fs.selector, save->fs.attrib, 4914 save->fs.limit, save->fs.base); 4915 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4916 "gs:", 4917 save->gs.selector, save->gs.attrib, 4918 save->gs.limit, save->gs.base); 4919 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4920 "gdtr:", 4921 save->gdtr.selector, save->gdtr.attrib, 4922 save->gdtr.limit, save->gdtr.base); 4923 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4924 "ldtr:", 4925 
save->ldtr.selector, save->ldtr.attrib, 4926 save->ldtr.limit, save->ldtr.base); 4927 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4928 "idtr:", 4929 save->idtr.selector, save->idtr.attrib, 4930 save->idtr.limit, save->idtr.base); 4931 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 4932 "tr:", 4933 save->tr.selector, save->tr.attrib, 4934 save->tr.limit, save->tr.base); 4935 pr_err("cpl: %d efer: %016llx\n", 4936 save->cpl, save->efer); 4937 pr_err("%-15s %016llx %-13s %016llx\n", 4938 "cr0:", save->cr0, "cr2:", save->cr2); 4939 pr_err("%-15s %016llx %-13s %016llx\n", 4940 "cr3:", save->cr3, "cr4:", save->cr4); 4941 pr_err("%-15s %016llx %-13s %016llx\n", 4942 "dr6:", save->dr6, "dr7:", save->dr7); 4943 pr_err("%-15s %016llx %-13s %016llx\n", 4944 "rip:", save->rip, "rflags:", save->rflags); 4945 pr_err("%-15s %016llx %-13s %016llx\n", 4946 "rsp:", save->rsp, "rax:", save->rax); 4947 pr_err("%-15s %016llx %-13s %016llx\n", 4948 "star:", save->star, "lstar:", save->lstar); 4949 pr_err("%-15s %016llx %-13s %016llx\n", 4950 "cstar:", save->cstar, "sfmask:", save->sfmask); 4951 pr_err("%-15s %016llx %-13s %016llx\n", 4952 "kernel_gs_base:", save->kernel_gs_base, 4953 "sysenter_cs:", save->sysenter_cs); 4954 pr_err("%-15s %016llx %-13s %016llx\n", 4955 "sysenter_esp:", save->sysenter_esp, 4956 "sysenter_eip:", save->sysenter_eip); 4957 pr_err("%-15s %016llx %-13s %016llx\n", 4958 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 4959 pr_err("%-15s %016llx %-13s %016llx\n", 4960 "br_from:", save->br_from, "br_to:", save->br_to); 4961 pr_err("%-15s %016llx %-13s %016llx\n", 4962 "excp_from:", save->last_excp_from, 4963 "excp_to:", save->last_excp_to); 4964 } 4965 4966 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 4967 { 4968 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 4969 4970 *info1 = control->exit_info_1; 4971 *info2 = control->exit_info_2; 4972 } 4973 4974 static int handle_exit(struct kvm_vcpu *vcpu) 4975 { 4976 struct vcpu_svm *svm = to_svm(vcpu); 4977 struct kvm_run *kvm_run = vcpu->run; 4978 u32 exit_code = svm->vmcb->control.exit_code; 4979 4980 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); 4981 4982 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 4983 vcpu->arch.cr0 = svm->vmcb->save.cr0; 4984 if (npt_enabled) 4985 vcpu->arch.cr3 = svm->vmcb->save.cr3; 4986 4987 if (unlikely(svm->nested.exit_required)) { 4988 nested_svm_vmexit(svm); 4989 svm->nested.exit_required = false; 4990 4991 return 1; 4992 } 4993 4994 if (is_guest_mode(vcpu)) { 4995 int vmexit; 4996 4997 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 4998 svm->vmcb->control.exit_info_1, 4999 svm->vmcb->control.exit_info_2, 5000 svm->vmcb->control.exit_int_info, 5001 svm->vmcb->control.exit_int_info_err, 5002 KVM_ISA_SVM); 5003 5004 vmexit = nested_svm_exit_special(svm); 5005 5006 if (vmexit == NESTED_EXIT_CONTINUE) 5007 vmexit = nested_svm_exit_handled(svm); 5008 5009 if (vmexit == NESTED_EXIT_DONE) 5010 return 1; 5011 } 5012 5013 svm_complete_interrupts(svm); 5014 5015 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 5016 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5017 kvm_run->fail_entry.hardware_entry_failure_reason 5018 = svm->vmcb->control.exit_code; 5019 dump_vmcb(vcpu); 5020 return 0; 5021 } 5022 5023 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 5024 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 5025 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 5026 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 
5027 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 5028 "exit_code 0x%x\n", 5029 __func__, svm->vmcb->control.exit_int_info, 5030 exit_code); 5031 5032 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 5033 || !svm_exit_handlers[exit_code]) { 5034 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); 5035 kvm_queue_exception(vcpu, UD_VECTOR); 5036 return 1; 5037 } 5038 5039 return svm_exit_handlers[exit_code](svm); 5040 } 5041 5042 static void reload_tss(struct kvm_vcpu *vcpu) 5043 { 5044 int cpu = raw_smp_processor_id(); 5045 5046 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 5047 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 5048 load_TR_desc(); 5049 } 5050 5051 static void pre_sev_run(struct vcpu_svm *svm, int cpu) 5052 { 5053 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 5054 int asid = sev_get_asid(svm->vcpu.kvm); 5055 5056 /* Assign the asid allocated with this SEV guest */ 5057 svm->vmcb->control.asid = asid; 5058 5059 /* 5060 * Flush guest TLB: 5061 * 5062 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 5063 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 5064 */ 5065 if (sd->sev_vmcbs[asid] == svm->vmcb && 5066 svm->last_cpu == cpu) 5067 return; 5068 5069 svm->last_cpu = cpu; 5070 sd->sev_vmcbs[asid] = svm->vmcb; 5071 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 5072 mark_dirty(svm->vmcb, VMCB_ASID); 5073 } 5074 5075 static void pre_svm_run(struct vcpu_svm *svm) 5076 { 5077 int cpu = raw_smp_processor_id(); 5078 5079 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 5080 5081 if (sev_guest(svm->vcpu.kvm)) 5082 return pre_sev_run(svm, cpu); 5083 5084 /* FIXME: handle wraparound of asid_generation */ 5085 if (svm->asid_generation != sd->asid_generation) 5086 new_asid(svm, sd); 5087 } 5088 5089 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 5090 { 5091 struct vcpu_svm *svm = to_svm(vcpu); 5092 5093 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 5094 vcpu->arch.hflags |= HF_NMI_MASK; 5095 set_intercept(svm, INTERCEPT_IRET); 5096 ++vcpu->stat.nmi_injections; 5097 } 5098 5099 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 5100 { 5101 struct vmcb_control_area *control; 5102 5103 /* The following fields are ignored when AVIC is enabled */ 5104 control = &svm->vmcb->control; 5105 control->int_vector = irq; 5106 control->int_ctl &= ~V_INTR_PRIO_MASK; 5107 control->int_ctl |= V_IRQ_MASK | 5108 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 5109 mark_dirty(svm->vmcb, VMCB_INTR); 5110 } 5111 5112 static void svm_set_irq(struct kvm_vcpu *vcpu) 5113 { 5114 struct vcpu_svm *svm = to_svm(vcpu); 5115 5116 BUG_ON(!(gif_set(svm))); 5117 5118 trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 5119 ++vcpu->stat.irq_injections; 5120 5121 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 5122 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 5123 } 5124 5125 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) 5126 { 5127 return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); 5128 } 5129 5130 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 5131 { 5132 struct vcpu_svm *svm = to_svm(vcpu); 5133 5134 if (svm_nested_virtualize_tpr(vcpu) || 5135 kvm_vcpu_apicv_active(vcpu)) 5136 return; 5137 5138 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); 5139 5140 if (irr == -1) 5141 return; 5142 5143 if (tpr >= irr) 5144 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 5145 } 5146 5147 static void svm_set_virtual_apic_mode(struct 
kvm_vcpu *vcpu) 5148 { 5149 return; 5150 } 5151 5152 static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) 5153 { 5154 return avic && irqchip_split(vcpu->kvm); 5155 } 5156 5157 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 5158 { 5159 } 5160 5161 static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 5162 { 5163 } 5164 5165 /* Note: Currently only used by Hyper-V. */ 5166 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 5167 { 5168 struct vcpu_svm *svm = to_svm(vcpu); 5169 struct vmcb *vmcb = svm->vmcb; 5170 5171 if (kvm_vcpu_apicv_active(vcpu)) 5172 vmcb->control.int_ctl |= AVIC_ENABLE_MASK; 5173 else 5174 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; 5175 mark_dirty(vmcb, VMCB_AVIC); 5176 } 5177 5178 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 5179 { 5180 return; 5181 } 5182 5183 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) 5184 { 5185 kvm_lapic_set_irr(vec, vcpu->arch.apic); 5186 smp_mb__after_atomic(); 5187 5188 if (avic_vcpu_is_running(vcpu)) { 5189 int cpuid = vcpu->cpu; 5190 5191 if (cpuid != get_cpu()) 5192 wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); 5193 put_cpu(); 5194 } else 5195 kvm_vcpu_wake_up(vcpu); 5196 } 5197 5198 static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) 5199 { 5200 return false; 5201 } 5202 5203 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 5204 { 5205 unsigned long flags; 5206 struct amd_svm_iommu_ir *cur; 5207 5208 spin_lock_irqsave(&svm->ir_list_lock, flags); 5209 list_for_each_entry(cur, &svm->ir_list, node) { 5210 if (cur->data != pi->ir_data) 5211 continue; 5212 list_del(&cur->node); 5213 kfree(cur); 5214 break; 5215 } 5216 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 5217 } 5218 5219 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 5220 { 5221 int ret = 0; 5222 unsigned long flags; 5223 struct amd_svm_iommu_ir *ir; 5224 5225 /** 5226 * In some cases, the existing irte is updaed and re-set, 5227 * so we need to check here if it's already been * added 5228 * to the ir_list. 5229 */ 5230 if (pi->ir_data && (pi->prev_ga_tag != 0)) { 5231 struct kvm *kvm = svm->vcpu.kvm; 5232 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); 5233 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); 5234 struct vcpu_svm *prev_svm; 5235 5236 if (!prev_vcpu) { 5237 ret = -EINVAL; 5238 goto out; 5239 } 5240 5241 prev_svm = to_svm(prev_vcpu); 5242 svm_ir_list_del(prev_svm, pi); 5243 } 5244 5245 /** 5246 * Allocating new amd_iommu_pi_data, which will get 5247 * add to the per-vcpu ir_list. 5248 */ 5249 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); 5250 if (!ir) { 5251 ret = -ENOMEM; 5252 goto out; 5253 } 5254 ir->data = pi->ir_data; 5255 5256 spin_lock_irqsave(&svm->ir_list_lock, flags); 5257 list_add(&ir->node, &svm->ir_list); 5258 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 5259 out: 5260 return ret; 5261 } 5262 5263 /** 5264 * Note: 5265 * The HW cannot support posting multicast/broadcast 5266 * interrupts to a vCPU. So, we still use legacy interrupt 5267 * remapping for these kind of interrupts. 5268 * 5269 * For lowest-priority interrupts, we only support 5270 * those with single CPU as the destination, e.g. user 5271 * configures the interrupts via /proc/irq or uses 5272 * irqbalance to make the interrupts single-CPU. 
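 *
 * get_pi_vcpu_info() below returns -1 when the interrupt cannot be
 * posted (multiple destinations or a non-postable delivery mode), in
 * which case the caller falls back to legacy interrupt remapping; on
 * success it fills vcpu_info with the vector and the physical address
 * of the target vCPU's AVIC backing page.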
5273 */ 5274 static int 5275 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 5276 struct vcpu_data *vcpu_info, struct vcpu_svm **svm) 5277 { 5278 struct kvm_lapic_irq irq; 5279 struct kvm_vcpu *vcpu = NULL; 5280 5281 kvm_set_msi_irq(kvm, e, &irq); 5282 5283 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 5284 !kvm_irq_is_postable(&irq)) { 5285 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", 5286 __func__, irq.vector); 5287 return -1; 5288 } 5289 5290 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 5291 irq.vector); 5292 *svm = to_svm(vcpu); 5293 vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 5294 vcpu_info->vector = irq.vector; 5295 5296 return 0; 5297 } 5298 5299 /* 5300 * svm_update_pi_irte - set IRTE for Posted-Interrupts 5301 * 5302 * @kvm: kvm 5303 * @host_irq: host irq of the interrupt 5304 * @guest_irq: gsi of the interrupt 5305 * @set: set or unset PI 5306 * returns 0 on success, < 0 on failure 5307 */ 5308 static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, 5309 uint32_t guest_irq, bool set) 5310 { 5311 struct kvm_kernel_irq_routing_entry *e; 5312 struct kvm_irq_routing_table *irq_rt; 5313 int idx, ret = -EINVAL; 5314 5315 if (!kvm_arch_has_assigned_device(kvm) || 5316 !irq_remapping_cap(IRQ_POSTING_CAP)) 5317 return 0; 5318 5319 pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", 5320 __func__, host_irq, guest_irq, set); 5321 5322 idx = srcu_read_lock(&kvm->irq_srcu); 5323 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 5324 WARN_ON(guest_irq >= irq_rt->nr_rt_entries); 5325 5326 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 5327 struct vcpu_data vcpu_info; 5328 struct vcpu_svm *svm = NULL; 5329 5330 if (e->type != KVM_IRQ_ROUTING_MSI) 5331 continue; 5332 5333 /** 5334 * Here, we setup with legacy mode in the following cases: 5335 * 1. When cannot target interrupt to a specific vcpu. 5336 * 2. Unsetting posted interrupt. 5337 * 3. APIC virtialization is disabled for the vcpu. 5338 * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) 5339 */ 5340 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && 5341 kvm_vcpu_apicv_active(&svm->vcpu)) { 5342 struct amd_iommu_pi_data pi; 5343 5344 /* Try to enable guest_mode in IRTE */ 5345 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 5346 AVIC_HPA_MASK); 5347 pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 5348 svm->vcpu.vcpu_id); 5349 pi.is_guest_mode = true; 5350 pi.vcpu_data = &vcpu_info; 5351 ret = irq_set_vcpu_affinity(host_irq, &pi); 5352 5353 /** 5354 * Here, we successfully setting up vcpu affinity in 5355 * IOMMU guest mode. Now, we need to store the posted 5356 * interrupt information in a per-vcpu ir_list so that 5357 * we can reference to them directly when we update vcpu 5358 * scheduling information in IOMMU irte. 5359 */ 5360 if (!ret && pi.is_guest_mode) 5361 svm_ir_list_add(svm, &pi); 5362 } else { 5363 /* Use legacy mode in IRTE */ 5364 struct amd_iommu_pi_data pi; 5365 5366 /** 5367 * Here, pi is used to: 5368 * - Tell IOMMU to use legacy mode for this interrupt. 5369 * - Retrieve ga_tag of prior interrupt remapping data. 5370 */ 5371 pi.is_guest_mode = false; 5372 ret = irq_set_vcpu_affinity(host_irq, &pi); 5373 5374 /** 5375 * Check if the posted interrupt was previously 5376 * setup with the guest_mode by checking if the ga_tag 5377 * was cached. If so, we need to clean up the per-vcpu 5378 * ir_list. 
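 * The cached ga_tag encodes the VM and vcpu ids (see AVIC_GATAG and
 * AVIC_GATAG_TO_VCPUID), which is how the owning vcpu is looked up
 * below before its ir_list entry is removed.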
5379 */ 5380 if (!ret && pi.prev_ga_tag) { 5381 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); 5382 struct kvm_vcpu *vcpu; 5383 5384 vcpu = kvm_get_vcpu_by_id(kvm, id); 5385 if (vcpu) 5386 svm_ir_list_del(to_svm(vcpu), &pi); 5387 } 5388 } 5389 5390 if (!ret && svm) { 5391 trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, 5392 e->gsi, vcpu_info.vector, 5393 vcpu_info.pi_desc_addr, set); 5394 } 5395 5396 if (ret < 0) { 5397 pr_err("%s: failed to update PI IRTE\n", __func__); 5398 goto out; 5399 } 5400 } 5401 5402 ret = 0; 5403 out: 5404 srcu_read_unlock(&kvm->irq_srcu, idx); 5405 return ret; 5406 } 5407 5408 static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 5409 { 5410 struct vcpu_svm *svm = to_svm(vcpu); 5411 struct vmcb *vmcb = svm->vmcb; 5412 int ret; 5413 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 5414 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 5415 ret = ret && gif_set(svm) && nested_svm_nmi(svm); 5416 5417 return ret; 5418 } 5419 5420 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 5421 { 5422 struct vcpu_svm *svm = to_svm(vcpu); 5423 5424 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); 5425 } 5426 5427 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5428 { 5429 struct vcpu_svm *svm = to_svm(vcpu); 5430 5431 if (masked) { 5432 svm->vcpu.arch.hflags |= HF_NMI_MASK; 5433 set_intercept(svm, INTERCEPT_IRET); 5434 } else { 5435 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 5436 clr_intercept(svm, INTERCEPT_IRET); 5437 } 5438 } 5439 5440 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 5441 { 5442 struct vcpu_svm *svm = to_svm(vcpu); 5443 struct vmcb *vmcb = svm->vmcb; 5444 int ret; 5445 5446 if (!gif_set(svm) || 5447 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 5448 return 0; 5449 5450 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); 5451 5452 if (is_guest_mode(vcpu)) 5453 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 5454 5455 return ret; 5456 } 5457 5458 static void enable_irq_window(struct kvm_vcpu *vcpu) 5459 { 5460 struct vcpu_svm *svm = to_svm(vcpu); 5461 5462 if (kvm_vcpu_apicv_active(vcpu)) 5463 return; 5464 5465 /* 5466 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 5467 * 1, because that's a separate STGI/VMRUN intercept. The next time we 5468 * get that intercept, this function will be called again though and 5469 * we'll get the vintr intercept. However, if the vGIF feature is 5470 * enabled, the STGI interception will not occur. Enable the irq 5471 * window under the assumption that the hardware will set the GIF. 5472 */ 5473 if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { 5474 svm_set_vintr(svm); 5475 svm_inject_irq(svm, 0x0); 5476 } 5477 } 5478 5479 static void enable_nmi_window(struct kvm_vcpu *vcpu) 5480 { 5481 struct vcpu_svm *svm = to_svm(vcpu); 5482 5483 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) 5484 == HF_NMI_MASK) 5485 return; /* IRET will cause a vm exit */ 5486 5487 if (!gif_set(svm)) { 5488 if (vgif_enabled(svm)) 5489 set_intercept(svm, INTERCEPT_STGI); 5490 return; /* STGI will cause a vm exit */ 5491 } 5492 5493 if (svm->nested.exit_required) 5494 return; /* we're not going to run the guest yet */ 5495 5496 /* 5497 * Something prevents NMI from been injected. 
Single step over possible 5498 * problem (IRET or exception injection or interrupt shadow) 5499 */ 5500 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 5501 svm->nmi_singlestep = true; 5502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 5503 } 5504 5505 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 5506 { 5507 return 0; 5508 } 5509 5510 static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5511 { 5512 return 0; 5513 } 5514 5515 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) 5516 { 5517 struct vcpu_svm *svm = to_svm(vcpu); 5518 5519 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 5520 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 5521 else 5522 svm->asid_generation--; 5523 } 5524 5525 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 5526 { 5527 struct vcpu_svm *svm = to_svm(vcpu); 5528 5529 invlpga(gva, svm->vmcb->control.asid); 5530 } 5531 5532 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 5533 { 5534 } 5535 5536 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 5537 { 5538 struct vcpu_svm *svm = to_svm(vcpu); 5539 5540 if (svm_nested_virtualize_tpr(vcpu)) 5541 return; 5542 5543 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { 5544 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 5545 kvm_set_cr8(vcpu, cr8); 5546 } 5547 } 5548 5549 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 5550 { 5551 struct vcpu_svm *svm = to_svm(vcpu); 5552 u64 cr8; 5553 5554 if (svm_nested_virtualize_tpr(vcpu) || 5555 kvm_vcpu_apicv_active(vcpu)) 5556 return; 5557 5558 cr8 = kvm_get_cr8(vcpu); 5559 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 5560 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 5561 } 5562 5563 static void svm_complete_interrupts(struct vcpu_svm *svm) 5564 { 5565 u8 vector; 5566 int type; 5567 u32 exitintinfo = svm->vmcb->control.exit_int_info; 5568 unsigned int3_injected = svm->int3_injected; 5569 5570 svm->int3_injected = 0; 5571 5572 /* 5573 * If we've made progress since setting HF_IRET_MASK, we've 5574 * executed an IRET and can allow NMI injection. 5575 */ 5576 if ((svm->vcpu.arch.hflags & HF_IRET_MASK) 5577 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { 5578 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 5579 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 5580 } 5581 5582 svm->vcpu.arch.nmi_injected = false; 5583 kvm_clear_exception_queue(&svm->vcpu); 5584 kvm_clear_interrupt_queue(&svm->vcpu); 5585 5586 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 5587 return; 5588 5589 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 5590 5591 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 5592 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 5593 5594 switch (type) { 5595 case SVM_EXITINTINFO_TYPE_NMI: 5596 svm->vcpu.arch.nmi_injected = true; 5597 break; 5598 case SVM_EXITINTINFO_TYPE_EXEPT: 5599 /* 5600 * In case of software exceptions, do not reinject the vector, 5601 * but re-execute the instruction instead. Rewind RIP first 5602 * if we emulated INT3 before. 
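 *
 * int3_injected holds the length of the emulated INT3 and int3_rip
 * the linear RIP recorded at injection time, so the rewind below is
 * only applied while the guest is still at that point.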
5603 */ 5604 if (kvm_exception_is_soft(vector)) { 5605 if (vector == BP_VECTOR && int3_injected && 5606 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) 5607 kvm_rip_write(&svm->vcpu, 5608 kvm_rip_read(&svm->vcpu) - 5609 int3_injected); 5610 break; 5611 } 5612 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 5613 u32 err = svm->vmcb->control.exit_int_info_err; 5614 kvm_requeue_exception_e(&svm->vcpu, vector, err); 5615 5616 } else 5617 kvm_requeue_exception(&svm->vcpu, vector); 5618 break; 5619 case SVM_EXITINTINFO_TYPE_INTR: 5620 kvm_queue_interrupt(&svm->vcpu, vector, false); 5621 break; 5622 default: 5623 break; 5624 } 5625 } 5626 5627 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 5628 { 5629 struct vcpu_svm *svm = to_svm(vcpu); 5630 struct vmcb_control_area *control = &svm->vmcb->control; 5631 5632 control->exit_int_info = control->event_inj; 5633 control->exit_int_info_err = control->event_inj_err; 5634 control->event_inj = 0; 5635 svm_complete_interrupts(svm); 5636 } 5637 5638 static void svm_vcpu_run(struct kvm_vcpu *vcpu) 5639 { 5640 struct vcpu_svm *svm = to_svm(vcpu); 5641 5642 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 5643 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 5644 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 5645 5646 /* 5647 * A vmexit emulation is required before the vcpu can be executed 5648 * again. 5649 */ 5650 if (unlikely(svm->nested.exit_required)) 5651 return; 5652 5653 /* 5654 * Disable singlestep if we're injecting an interrupt/exception. 5655 * We don't want our modified rflags to be pushed on the stack where 5656 * we might not be able to easily reset them if we disabled NMI 5657 * singlestep later. 5658 */ 5659 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 5660 /* 5661 * Event injection happens before external interrupts cause a 5662 * vmexit and interrupts are disabled here, so smp_send_reschedule 5663 * is enough to force an immediate vmexit. 5664 */ 5665 disable_nmi_singlestep(svm); 5666 smp_send_reschedule(vcpu->cpu); 5667 } 5668 5669 pre_svm_run(svm); 5670 5671 sync_lapic_to_cr8(vcpu); 5672 5673 svm->vmcb->save.cr2 = vcpu->arch.cr2; 5674 5675 clgi(); 5676 kvm_load_guest_xcr0(vcpu); 5677 5678 if (lapic_in_kernel(vcpu) && 5679 vcpu->arch.apic->lapic_timer.timer_advance_ns) 5680 kvm_wait_lapic_expire(vcpu); 5681 5682 /* 5683 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 5684 * it's non-zero. Since vmentry is serialising on affected CPUs, there 5685 * is no need to worry about the conditional branch over the wrmsr 5686 * being speculatively taken. 
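 *
 * The host value is written back via x86_spec_ctrl_restore_host()
 * once the guest has exited, see below.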
5687 */ 5688 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); 5689 5690 local_irq_enable(); 5691 5692 asm volatile ( 5693 "push %%" _ASM_BP "; \n\t" 5694 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" 5695 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" 5696 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" 5697 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" 5698 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" 5699 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" 5700 #ifdef CONFIG_X86_64 5701 "mov %c[r8](%[svm]), %%r8 \n\t" 5702 "mov %c[r9](%[svm]), %%r9 \n\t" 5703 "mov %c[r10](%[svm]), %%r10 \n\t" 5704 "mov %c[r11](%[svm]), %%r11 \n\t" 5705 "mov %c[r12](%[svm]), %%r12 \n\t" 5706 "mov %c[r13](%[svm]), %%r13 \n\t" 5707 "mov %c[r14](%[svm]), %%r14 \n\t" 5708 "mov %c[r15](%[svm]), %%r15 \n\t" 5709 #endif 5710 5711 /* Enter guest mode */ 5712 "push %%" _ASM_AX " \n\t" 5713 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" 5714 __ex("vmload %%" _ASM_AX) "\n\t" 5715 __ex("vmrun %%" _ASM_AX) "\n\t" 5716 __ex("vmsave %%" _ASM_AX) "\n\t" 5717 "pop %%" _ASM_AX " \n\t" 5718 5719 /* Save guest registers, load host registers */ 5720 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" 5721 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" 5722 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" 5723 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" 5724 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" 5725 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" 5726 #ifdef CONFIG_X86_64 5727 "mov %%r8, %c[r8](%[svm]) \n\t" 5728 "mov %%r9, %c[r9](%[svm]) \n\t" 5729 "mov %%r10, %c[r10](%[svm]) \n\t" 5730 "mov %%r11, %c[r11](%[svm]) \n\t" 5731 "mov %%r12, %c[r12](%[svm]) \n\t" 5732 "mov %%r13, %c[r13](%[svm]) \n\t" 5733 "mov %%r14, %c[r14](%[svm]) \n\t" 5734 "mov %%r15, %c[r15](%[svm]) \n\t" 5735 /* 5736 * Clear host registers marked as clobbered to prevent 5737 * speculative use. 
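 * The 32-bit xor forms below are sufficient: writing a 32-bit
 * register zero-extends the result into the full 64-bit register.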
5738 */ 5739 "xor %%r8d, %%r8d \n\t" 5740 "xor %%r9d, %%r9d \n\t" 5741 "xor %%r10d, %%r10d \n\t" 5742 "xor %%r11d, %%r11d \n\t" 5743 "xor %%r12d, %%r12d \n\t" 5744 "xor %%r13d, %%r13d \n\t" 5745 "xor %%r14d, %%r14d \n\t" 5746 "xor %%r15d, %%r15d \n\t" 5747 #endif 5748 "xor %%ebx, %%ebx \n\t" 5749 "xor %%ecx, %%ecx \n\t" 5750 "xor %%edx, %%edx \n\t" 5751 "xor %%esi, %%esi \n\t" 5752 "xor %%edi, %%edi \n\t" 5753 "pop %%" _ASM_BP 5754 : 5755 : [svm]"a"(svm), 5756 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 5757 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), 5758 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), 5759 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), 5760 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), 5761 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), 5762 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) 5763 #ifdef CONFIG_X86_64 5764 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), 5765 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), 5766 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), 5767 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), 5768 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), 5769 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), 5770 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), 5771 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 5772 #endif 5773 : "cc", "memory" 5774 #ifdef CONFIG_X86_64 5775 , "rbx", "rcx", "rdx", "rsi", "rdi" 5776 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 5777 #else 5778 , "ebx", "ecx", "edx", "esi", "edi" 5779 #endif 5780 ); 5781 5782 /* Eliminate branch target predictions from guest mode */ 5783 vmexit_fill_RSB(); 5784 5785 #ifdef CONFIG_X86_64 5786 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 5787 #else 5788 loadsegment(fs, svm->host.fs); 5789 #ifndef CONFIG_X86_32_LAZY_GS 5790 loadsegment(gs, svm->host.gs); 5791 #endif 5792 #endif 5793 5794 /* 5795 * We do not use IBRS in the kernel. If this vCPU has used the 5796 * SPEC_CTRL MSR it may have left it on; save the value and 5797 * turn it off. This is much more efficient than blindly adding 5798 * it to the atomic save/restore list. Especially as the former 5799 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 5800 * 5801 * For non-nested case: 5802 * If the L01 MSR bitmap does not intercept the MSR, then we need to 5803 * save it. 5804 * 5805 * For nested case: 5806 * If the L02 MSR bitmap does not intercept the MSR, then we need to 5807 * save it. 
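 *
 * Both cases reduce to the single msr_write_intercepted() check
 * below, which consults whichever MSR permission bitmap is currently
 * in effect.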
5808 */ 5809 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 5810 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 5811 5812 reload_tss(vcpu); 5813 5814 local_irq_disable(); 5815 5816 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); 5817 5818 vcpu->arch.cr2 = svm->vmcb->save.cr2; 5819 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 5820 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 5821 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 5822 5823 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5824 kvm_before_interrupt(&svm->vcpu); 5825 5826 kvm_put_guest_xcr0(vcpu); 5827 stgi(); 5828 5829 /* Any pending NMI will happen here */ 5830 5831 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5832 kvm_after_interrupt(&svm->vcpu); 5833 5834 sync_cr8_to_lapic(vcpu); 5835 5836 svm->next_rip = 0; 5837 5838 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 5839 5840 /* if exit due to PF check for async PF */ 5841 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 5842 svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 5843 5844 if (npt_enabled) { 5845 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 5846 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 5847 } 5848 5849 /* 5850 * We need to handle MC intercepts here before the vcpu has a chance to 5851 * change the physical cpu 5852 */ 5853 if (unlikely(svm->vmcb->control.exit_code == 5854 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 5855 svm_handle_mce(svm); 5856 5857 mark_all_clean(svm->vmcb); 5858 } 5859 STACK_FRAME_NON_STANDARD(svm_vcpu_run); 5860 5861 static vo