~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/vmx.c

Version: ~ [ linux-5.4-rc3 ] ~ [ linux-5.3.6 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.79 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.149 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.196 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.196 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.75 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.102 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * Kernel-based Virtual Machine driver for Linux
  3  *
  4  * This module enables machines with Intel VT-x extensions to run virtual
  5  * machines without emulation or binary translation.
  6  *
  7  * Copyright (C) 2006 Qumranet, Inc.
  8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  9  *
 10  * Authors:
 11  *   Avi Kivity   <avi@qumranet.com>
 12  *   Yaniv Kamay  <yaniv@qumranet.com>
 13  *
 14  * This work is licensed under the terms of the GNU GPL, version 2.  See
 15  * the COPYING file in the top-level directory.
 16  *
 17  */
 18 
 19 #include "irq.h"
 20 #include "mmu.h"
 21 #include "cpuid.h"
 22 #include "lapic.h"
 23 
 24 #include <linux/kvm_host.h>
 25 #include <linux/module.h>
 26 #include <linux/kernel.h>
 27 #include <linux/mm.h>
 28 #include <linux/highmem.h>
 29 #include <linux/sched.h>
 30 #include <linux/moduleparam.h>
 31 #include <linux/mod_devicetable.h>
 32 #include <linux/trace_events.h>
 33 #include <linux/slab.h>
 34 #include <linux/tboot.h>
 35 #include <linux/hrtimer.h>
 36 #include <linux/frame.h>
 37 #include <linux/nospec.h>
 38 #include "kvm_cache_regs.h"
 39 #include "x86.h"
 40 
 41 #include <asm/cpu.h>
 42 #include <asm/io.h>
 43 #include <asm/desc.h>
 44 #include <asm/vmx.h>
 45 #include <asm/virtext.h>
 46 #include <asm/mce.h>
 47 #include <asm/fpu/internal.h>
 48 #include <asm/perf_event.h>
 49 #include <asm/debugreg.h>
 50 #include <asm/kexec.h>
 51 #include <asm/apic.h>
 52 #include <asm/irq_remapping.h>
 53 #include <asm/mmu_context.h>
 54 #include <asm/microcode.h>
 55 #include <asm/nospec-branch.h>
 56 
 57 #include "trace.h"
 58 #include "pmu.h"
 59 
 60 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 61 #define __ex_clear(x, reg) \
 62         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 63 
 64 MODULE_AUTHOR("Qumranet");
 65 MODULE_LICENSE("GPL");
 66 
 67 static const struct x86_cpu_id vmx_cpu_id[] = {
 68         X86_FEATURE_MATCH(X86_FEATURE_VMX),
 69         {}
 70 };
 71 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 72 
 73 static bool __read_mostly enable_vpid = 1;
 74 module_param_named(vpid, enable_vpid, bool, 0444);
 75 
 76 static bool __read_mostly enable_vnmi = 1;
 77 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
 78 
 79 static bool __read_mostly flexpriority_enabled = 1;
 80 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 81 
 82 static bool __read_mostly enable_ept = 1;
 83 module_param_named(ept, enable_ept, bool, S_IRUGO);
 84 
 85 static bool __read_mostly enable_unrestricted_guest = 1;
 86 module_param_named(unrestricted_guest,
 87                         enable_unrestricted_guest, bool, S_IRUGO);
 88 
 89 static bool __read_mostly enable_ept_ad_bits = 1;
 90 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 91 
 92 static bool __read_mostly emulate_invalid_guest_state = true;
 93 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 94 
 95 static bool __read_mostly fasteoi = 1;
 96 module_param(fasteoi, bool, S_IRUGO);
 97 
 98 static bool __read_mostly enable_apicv = 1;
 99 module_param(enable_apicv, bool, S_IRUGO);
100 
101 static bool __read_mostly enable_shadow_vmcs = 1;
102 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
103 /*
104  * If nested=1, nested virtualization is supported, i.e., guests may use
105  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
106  * use VMX instructions.
107  */
108 static bool __read_mostly nested = 0;
109 module_param(nested, bool, S_IRUGO);
110 
111 static u64 __read_mostly host_xss;
112 
113 static bool __read_mostly enable_pml = 1;
114 module_param_named(pml, enable_pml, bool, S_IRUGO);
115 
116 #define MSR_TYPE_R      1
117 #define MSR_TYPE_W      2
118 #define MSR_TYPE_RW     3
119 
120 #define MSR_BITMAP_MODE_X2APIC          1
121 #define MSR_BITMAP_MODE_X2APIC_APICV    2
122 #define MSR_BITMAP_MODE_LM              4
123 
124 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
125 
126 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
127 static int __read_mostly cpu_preemption_timer_multi;
128 static bool __read_mostly enable_preemption_timer = 1;
129 #ifdef CONFIG_X86_64
130 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
131 #endif
132 
133 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
134 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
135 #define KVM_VM_CR0_ALWAYS_ON                                            \
136         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
137 #define KVM_CR4_GUEST_OWNED_BITS                                      \
138         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
139          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
140 
141 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
142 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
143 
144 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
145 
146 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
147 
148 /*
149  * Hyper-V requires all of these, so mark them as supported even though
150  * they are just treated the same as all-context.
151  */
152 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
153         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
154         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
155         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
156         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
157 
158 /*
159  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
160  * ple_gap:    upper bound on the amount of time between two successive
161  *             executions of PAUSE in a loop. Also indicate if ple enabled.
162  *             According to test, this time is usually smaller than 128 cycles.
163  * ple_window: upper bound on the amount of time a guest is allowed to execute
164  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
165  *             less than 2^12 cycles
166  * Time is measured based on a counter that runs at the same rate as the TSC,
167  * refer SDM volume 3b section 21.6.13 & 22.1.3.
168  */
169 #define KVM_VMX_DEFAULT_PLE_GAP           128
170 #define KVM_VMX_DEFAULT_PLE_WINDOW        4096
171 #define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
172 #define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
173 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
174                 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
175 
176 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
177 module_param(ple_gap, int, S_IRUGO);
178 
179 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
180 module_param(ple_window, int, S_IRUGO);
181 
182 /* Default doubles per-vcpu window every exit. */
183 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
184 module_param(ple_window_grow, int, S_IRUGO);
185 
186 /* Default resets per-vcpu window every exit to ple_window. */
187 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
188 module_param(ple_window_shrink, int, S_IRUGO);
189 
190 /* Default is to compute the maximum so we can never overflow. */
191 static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
192 static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
193 module_param(ple_window_max, int, S_IRUGO);
194 
195 extern const ulong vmx_return;
196 
197 #define NR_AUTOLOAD_MSRS 8
198 
199 struct vmcs {
200         u32 revision_id;
201         u32 abort;
202         char data[0];
203 };
204 
205 /*
206  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
207  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
208  * loaded on this CPU (so we can clear them if the CPU goes down).
209  */
210 struct loaded_vmcs {
211         struct vmcs *vmcs;
212         struct vmcs *shadow_vmcs;
213         int cpu;
214         bool launched;
215         bool nmi_known_unmasked;
216         unsigned long vmcs_host_cr3;    /* May not match real cr3 */
217         unsigned long vmcs_host_cr4;    /* May not match real cr4 */
218         /* Support for vnmi-less CPUs */
219         int soft_vnmi_blocked;
220         ktime_t entry_time;
221         s64 vnmi_blocked_time;
222         unsigned long *msr_bitmap;
223         struct list_head loaded_vmcss_on_cpu_link;
224 };
225 
226 struct shared_msr_entry {
227         unsigned index;
228         u64 data;
229         u64 mask;
230 };
231 
232 /*
233  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
234  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
235  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
236  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
237  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
238  * More than one of these structures may exist, if L1 runs multiple L2 guests.
239  * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
240  * underlying hardware which will be used to run L2.
241  * This structure is packed to ensure that its layout is identical across
242  * machines (necessary for live migration).
243  * If there are changes in this struct, VMCS12_REVISION must be changed.
244  */
245 typedef u64 natural_width;
246 struct __packed vmcs12 {
247         /* According to the Intel spec, a VMCS region must start with the
248          * following two fields. Then follow implementation-specific data.
249          */
250         u32 revision_id;
251         u32 abort;
252 
253         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
254         u32 padding[7]; /* room for future expansion */
255 
256         u64 io_bitmap_a;
257         u64 io_bitmap_b;
258         u64 msr_bitmap;
259         u64 vm_exit_msr_store_addr;
260         u64 vm_exit_msr_load_addr;
261         u64 vm_entry_msr_load_addr;
262         u64 tsc_offset;
263         u64 virtual_apic_page_addr;
264         u64 apic_access_addr;
265         u64 posted_intr_desc_addr;
266         u64 vm_function_control;
267         u64 ept_pointer;
268         u64 eoi_exit_bitmap0;
269         u64 eoi_exit_bitmap1;
270         u64 eoi_exit_bitmap2;
271         u64 eoi_exit_bitmap3;
272         u64 eptp_list_address;
273         u64 xss_exit_bitmap;
274         u64 guest_physical_address;
275         u64 vmcs_link_pointer;
276         u64 pml_address;
277         u64 guest_ia32_debugctl;
278         u64 guest_ia32_pat;
279         u64 guest_ia32_efer;
280         u64 guest_ia32_perf_global_ctrl;
281         u64 guest_pdptr0;
282         u64 guest_pdptr1;
283         u64 guest_pdptr2;
284         u64 guest_pdptr3;
285         u64 guest_bndcfgs;
286         u64 host_ia32_pat;
287         u64 host_ia32_efer;
288         u64 host_ia32_perf_global_ctrl;
289         u64 padding64[8]; /* room for future expansion */
290         /*
291          * To allow migration of L1 (complete with its L2 guests) between
292          * machines of different natural widths (32 or 64 bit), we cannot have
293          * unsigned long fields with no explict size. We use u64 (aliased
294          * natural_width) instead. Luckily, x86 is little-endian.
295          */
296         natural_width cr0_guest_host_mask;
297         natural_width cr4_guest_host_mask;
298         natural_width cr0_read_shadow;
299         natural_width cr4_read_shadow;
300         natural_width cr3_target_value0;
301         natural_width cr3_target_value1;
302         natural_width cr3_target_value2;
303         natural_width cr3_target_value3;
304         natural_width exit_qualification;
305         natural_width guest_linear_address;
306         natural_width guest_cr0;
307         natural_width guest_cr3;
308         natural_width guest_cr4;
309         natural_width guest_es_base;
310         natural_width guest_cs_base;
311         natural_width guest_ss_base;
312         natural_width guest_ds_base;
313         natural_width guest_fs_base;
314         natural_width guest_gs_base;
315         natural_width guest_ldtr_base;
316         natural_width guest_tr_base;
317         natural_width guest_gdtr_base;
318         natural_width guest_idtr_base;
319         natural_width guest_dr7;
320         natural_width guest_rsp;
321         natural_width guest_rip;
322         natural_width guest_rflags;
323         natural_width guest_pending_dbg_exceptions;
324         natural_width guest_sysenter_esp;
325         natural_width guest_sysenter_eip;
326         natural_width host_cr0;
327         natural_width host_cr3;
328         natural_width host_cr4;
329         natural_width host_fs_base;
330         natural_width host_gs_base;
331         natural_width host_tr_base;
332         natural_width host_gdtr_base;
333         natural_width host_idtr_base;
334         natural_width host_ia32_sysenter_esp;
335         natural_width host_ia32_sysenter_eip;
336         natural_width host_rsp;
337         natural_width host_rip;
338         natural_width paddingl[8]; /* room for future expansion */
339         u32 pin_based_vm_exec_control;
340         u32 cpu_based_vm_exec_control;
341         u32 exception_bitmap;
342         u32 page_fault_error_code_mask;
343         u32 page_fault_error_code_match;
344         u32 cr3_target_count;
345         u32 vm_exit_controls;
346         u32 vm_exit_msr_store_count;
347         u32 vm_exit_msr_load_count;
348         u32 vm_entry_controls;
349         u32 vm_entry_msr_load_count;
350         u32 vm_entry_intr_info_field;
351         u32 vm_entry_exception_error_code;
352         u32 vm_entry_instruction_len;
353         u32 tpr_threshold;
354         u32 secondary_vm_exec_control;
355         u32 vm_instruction_error;
356         u32 vm_exit_reason;
357         u32 vm_exit_intr_info;
358         u32 vm_exit_intr_error_code;
359         u32 idt_vectoring_info_field;
360         u32 idt_vectoring_error_code;
361         u32 vm_exit_instruction_len;
362         u32 vmx_instruction_info;
363         u32 guest_es_limit;
364         u32 guest_cs_limit;
365         u32 guest_ss_limit;
366         u32 guest_ds_limit;
367         u32 guest_fs_limit;
368         u32 guest_gs_limit;
369         u32 guest_ldtr_limit;
370         u32 guest_tr_limit;
371         u32 guest_gdtr_limit;
372         u32 guest_idtr_limit;
373         u32 guest_es_ar_bytes;
374         u32 guest_cs_ar_bytes;
375         u32 guest_ss_ar_bytes;
376         u32 guest_ds_ar_bytes;
377         u32 guest_fs_ar_bytes;
378         u32 guest_gs_ar_bytes;
379         u32 guest_ldtr_ar_bytes;
380         u32 guest_tr_ar_bytes;
381         u32 guest_interruptibility_info;
382         u32 guest_activity_state;
383         u32 guest_sysenter_cs;
384         u32 host_ia32_sysenter_cs;
385         u32 vmx_preemption_timer_value;
386         u32 padding32[7]; /* room for future expansion */
387         u16 virtual_processor_id;
388         u16 posted_intr_nv;
389         u16 guest_es_selector;
390         u16 guest_cs_selector;
391         u16 guest_ss_selector;
392         u16 guest_ds_selector;
393         u16 guest_fs_selector;
394         u16 guest_gs_selector;
395         u16 guest_ldtr_selector;
396         u16 guest_tr_selector;
397         u16 guest_intr_status;
398         u16 guest_pml_index;
399         u16 host_es_selector;
400         u16 host_cs_selector;
401         u16 host_ss_selector;
402         u16 host_ds_selector;
403         u16 host_fs_selector;
404         u16 host_gs_selector;
405         u16 host_tr_selector;
406 };
407 
408 /*
409  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
410  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
411  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
412  */
413 #define VMCS12_REVISION 0x11e57ed0
414 
415 /*
416  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
417  * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
418  * current implementation, 4K are reserved to avoid future complications.
419  */
420 #define VMCS12_SIZE 0x1000
421 
422 /*
423  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
424  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
425  */
426 struct nested_vmx {
427         /* Has the level1 guest done vmxon? */
428         bool vmxon;
429         gpa_t vmxon_ptr;
430         bool pml_full;
431 
432         /* The guest-physical address of the current VMCS L1 keeps for L2 */
433         gpa_t current_vmptr;
434         /*
435          * Cache of the guest's VMCS, existing outside of guest memory.
436          * Loaded from guest memory during VMPTRLD. Flushed to guest
437          * memory during VMCLEAR and VMPTRLD.
438          */
439         struct vmcs12 *cached_vmcs12;
440         /*
441          * Indicates if the shadow vmcs must be updated with the
442          * data hold by vmcs12
443          */
444         bool sync_shadow_vmcs;
445 
446         bool change_vmcs01_virtual_x2apic_mode;
447         /* L2 must run next, and mustn't decide to exit to L1. */
448         bool nested_run_pending;
449 
450         struct loaded_vmcs vmcs02;
451 
452         /*
453          * Guest pages referred to in the vmcs02 with host-physical
454          * pointers, so we must keep them pinned while L2 runs.
455          */
456         struct page *apic_access_page;
457         struct page *virtual_apic_page;
458         struct page *pi_desc_page;
459         struct pi_desc *pi_desc;
460         bool pi_pending;
461         u16 posted_intr_nv;
462 
463         struct hrtimer preemption_timer;
464         bool preemption_timer_expired;
465 
466         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
467         u64 vmcs01_debugctl;
468 
469         u16 vpid02;
470         u16 last_vpid;
471 
472         /*
473          * We only store the "true" versions of the VMX capability MSRs. We
474          * generate the "non-true" versions by setting the must-be-1 bits
475          * according to the SDM.
476          */
477         u32 nested_vmx_procbased_ctls_low;
478         u32 nested_vmx_procbased_ctls_high;
479         u32 nested_vmx_secondary_ctls_low;
480         u32 nested_vmx_secondary_ctls_high;
481         u32 nested_vmx_pinbased_ctls_low;
482         u32 nested_vmx_pinbased_ctls_high;
483         u32 nested_vmx_exit_ctls_low;
484         u32 nested_vmx_exit_ctls_high;
485         u32 nested_vmx_entry_ctls_low;
486         u32 nested_vmx_entry_ctls_high;
487         u32 nested_vmx_misc_low;
488         u32 nested_vmx_misc_high;
489         u32 nested_vmx_ept_caps;
490         u32 nested_vmx_vpid_caps;
491         u64 nested_vmx_basic;
492         u64 nested_vmx_cr0_fixed0;
493         u64 nested_vmx_cr0_fixed1;
494         u64 nested_vmx_cr4_fixed0;
495         u64 nested_vmx_cr4_fixed1;
496         u64 nested_vmx_vmcs_enum;
497         u64 nested_vmx_vmfunc_controls;
498 
499         /* SMM related state */
500         struct {
501                 /* in VMX operation on SMM entry? */
502                 bool vmxon;
503                 /* in guest mode on SMM entry? */
504                 bool guest_mode;
505         } smm;
506 };
507 
508 #define POSTED_INTR_ON  0
509 #define POSTED_INTR_SN  1
510 
511 /* Posted-Interrupt Descriptor */
512 struct pi_desc {
513         u32 pir[8];     /* Posted interrupt requested */
514         union {
515                 struct {
516                                 /* bit 256 - Outstanding Notification */
517                         u16     on      : 1,
518                                 /* bit 257 - Suppress Notification */
519                                 sn      : 1,
520                                 /* bit 271:258 - Reserved */
521                                 rsvd_1  : 14;
522                                 /* bit 279:272 - Notification Vector */
523                         u8      nv;
524                                 /* bit 287:280 - Reserved */
525                         u8      rsvd_2;
526                                 /* bit 319:288 - Notification Destination */
527                         u32     ndst;
528                 };
529                 u64 control;
530         };
531         u32 rsvd[6];
532 } __aligned(64);
533 
534 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
535 {
536         return test_and_set_bit(POSTED_INTR_ON,
537                         (unsigned long *)&pi_desc->control);
538 }
539 
540 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
541 {
542         return test_and_clear_bit(POSTED_INTR_ON,
543                         (unsigned long *)&pi_desc->control);
544 }
545 
546 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
547 {
548         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
549 }
550 
551 static inline void pi_clear_sn(struct pi_desc *pi_desc)
552 {
553         return clear_bit(POSTED_INTR_SN,
554                         (unsigned long *)&pi_desc->control);
555 }
556 
557 static inline void pi_set_sn(struct pi_desc *pi_desc)
558 {
559         return set_bit(POSTED_INTR_SN,
560                         (unsigned long *)&pi_desc->control);
561 }
562 
563 static inline void pi_clear_on(struct pi_desc *pi_desc)
564 {
565         clear_bit(POSTED_INTR_ON,
566                   (unsigned long *)&pi_desc->control);
567 }
568 
569 static inline int pi_test_on(struct pi_desc *pi_desc)
570 {
571         return test_bit(POSTED_INTR_ON,
572                         (unsigned long *)&pi_desc->control);
573 }
574 
575 static inline int pi_test_sn(struct pi_desc *pi_desc)
576 {
577         return test_bit(POSTED_INTR_SN,
578                         (unsigned long *)&pi_desc->control);
579 }
580 
581 struct vcpu_vmx {
582         struct kvm_vcpu       vcpu;
583         unsigned long         host_rsp;
584         u8                    fail;
585         u8                    msr_bitmap_mode;
586         u32                   exit_intr_info;
587         u32                   idt_vectoring_info;
588         ulong                 rflags;
589         struct shared_msr_entry *guest_msrs;
590         int                   nmsrs;
591         int                   save_nmsrs;
592         unsigned long         host_idt_base;
593 #ifdef CONFIG_X86_64
594         u64                   msr_host_kernel_gs_base;
595         u64                   msr_guest_kernel_gs_base;
596 #endif
597 
598         u64                   arch_capabilities;
599         u64                   spec_ctrl;
600 
601         u32 vm_entry_controls_shadow;
602         u32 vm_exit_controls_shadow;
603         u32 secondary_exec_control;
604 
605         /*
606          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
607          * non-nested (L1) guest, it always points to vmcs01. For a nested
608          * guest (L2), it points to a different VMCS.
609          */
610         struct loaded_vmcs    vmcs01;
611         struct loaded_vmcs   *loaded_vmcs;
612         bool                  __launched; /* temporary, used in vmx_vcpu_run */
613         struct msr_autoload {
614                 unsigned nr;
615                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
616                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
617         } msr_autoload;
618         struct {
619                 int           loaded;
620                 u16           fs_sel, gs_sel, ldt_sel;
621 #ifdef CONFIG_X86_64
622                 u16           ds_sel, es_sel;
623 #endif
624                 int           gs_ldt_reload_needed;
625                 int           fs_reload_needed;
626                 u64           msr_host_bndcfgs;
627         } host_state;
628         struct {
629                 int vm86_active;
630                 ulong save_rflags;
631                 struct kvm_segment segs[8];
632         } rmode;
633         struct {
634                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
635                 struct kvm_save_segment {
636                         u16 selector;
637                         unsigned long base;
638                         u32 limit;
639                         u32 ar;
640                 } seg[8];
641         } segment_cache;
642         int vpid;
643         bool emulation_required;
644 
645         u32 exit_reason;
646 
647         /* Posted interrupt descriptor */
648         struct pi_desc pi_desc;
649 
650         /* Support for a guest hypervisor (nested VMX) */
651         struct nested_vmx nested;
652 
653         /* Dynamic PLE window. */
654         int ple_window;
655         bool ple_window_dirty;
656 
657         /* Support for PML */
658 #define PML_ENTITY_NUM          512
659         struct page *pml_pg;
660 
661         /* apic deadline value in host tsc */
662         u64 hv_deadline_tsc;
663 
664         u64 current_tsc_ratio;
665 
666         u32 host_pkru;
667 
668         /*
669          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
670          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
671          * in msr_ia32_feature_control_valid_bits.
672          */
673         u64 msr_ia32_feature_control;
674         u64 msr_ia32_feature_control_valid_bits;
675 };
676 
677 enum segment_cache_field {
678         SEG_FIELD_SEL = 0,
679         SEG_FIELD_BASE = 1,
680         SEG_FIELD_LIMIT = 2,
681         SEG_FIELD_AR = 3,
682 
683         SEG_FIELD_NR = 4
684 };
685 
686 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
687 {
688         return container_of(vcpu, struct vcpu_vmx, vcpu);
689 }
690 
691 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
692 {
693         return &(to_vmx(vcpu)->pi_desc);
694 }
695 
696 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
697 #define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
698 #define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
699                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
700 
701 
702 static unsigned long shadow_read_only_fields[] = {
703         /*
704          * We do NOT shadow fields that are modified when L0
705          * traps and emulates any vmx instruction (e.g. VMPTRLD,
706          * VMXON...) executed by L1.
707          * For example, VM_INSTRUCTION_ERROR is read
708          * by L1 if a vmx instruction fails (part of the error path).
709          * Note the code assumes this logic. If for some reason
710          * we start shadowing these fields then we need to
711          * force a shadow sync when L0 emulates vmx instructions
712          * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
713          * by nested_vmx_failValid)
714          */
715         VM_EXIT_REASON,
716         VM_EXIT_INTR_INFO,
717         VM_EXIT_INSTRUCTION_LEN,
718         IDT_VECTORING_INFO_FIELD,
719         IDT_VECTORING_ERROR_CODE,
720         VM_EXIT_INTR_ERROR_CODE,
721         EXIT_QUALIFICATION,
722         GUEST_LINEAR_ADDRESS,
723         GUEST_PHYSICAL_ADDRESS
724 };
725 static int max_shadow_read_only_fields =
726         ARRAY_SIZE(shadow_read_only_fields);
727 
728 static unsigned long shadow_read_write_fields[] = {
729         TPR_THRESHOLD,
730         GUEST_RIP,
731         GUEST_RSP,
732         GUEST_CR0,
733         GUEST_CR3,
734         GUEST_CR4,
735         GUEST_INTERRUPTIBILITY_INFO,
736         GUEST_RFLAGS,
737         GUEST_CS_SELECTOR,
738         GUEST_CS_AR_BYTES,
739         GUEST_CS_LIMIT,
740         GUEST_CS_BASE,
741         GUEST_ES_BASE,
742         GUEST_BNDCFGS,
743         CR0_GUEST_HOST_MASK,
744         CR0_READ_SHADOW,
745         CR4_READ_SHADOW,
746         TSC_OFFSET,
747         EXCEPTION_BITMAP,
748         CPU_BASED_VM_EXEC_CONTROL,
749         VM_ENTRY_EXCEPTION_ERROR_CODE,
750         VM_ENTRY_INTR_INFO_FIELD,
751         VM_ENTRY_INSTRUCTION_LEN,
752         VM_ENTRY_EXCEPTION_ERROR_CODE,
753         HOST_FS_BASE,
754         HOST_GS_BASE,
755         HOST_FS_SELECTOR,
756         HOST_GS_SELECTOR
757 };
758 static int max_shadow_read_write_fields =
759         ARRAY_SIZE(shadow_read_write_fields);
760 
761 static const unsigned short vmcs_field_to_offset_table[] = {
762         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
763         FIELD(POSTED_INTR_NV, posted_intr_nv),
764         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
765         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
766         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
767         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
768         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
769         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
770         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
771         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
772         FIELD(GUEST_INTR_STATUS, guest_intr_status),
773         FIELD(GUEST_PML_INDEX, guest_pml_index),
774         FIELD(HOST_ES_SELECTOR, host_es_selector),
775         FIELD(HOST_CS_SELECTOR, host_cs_selector),
776         FIELD(HOST_SS_SELECTOR, host_ss_selector),
777         FIELD(HOST_DS_SELECTOR, host_ds_selector),
778         FIELD(HOST_FS_SELECTOR, host_fs_selector),
779         FIELD(HOST_GS_SELECTOR, host_gs_selector),
780         FIELD(HOST_TR_SELECTOR, host_tr_selector),
781         FIELD64(IO_BITMAP_A, io_bitmap_a),
782         FIELD64(IO_BITMAP_B, io_bitmap_b),
783         FIELD64(MSR_BITMAP, msr_bitmap),
784         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
785         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
786         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
787         FIELD64(TSC_OFFSET, tsc_offset),
788         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
789         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
790         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
791         FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
792         FIELD64(EPT_POINTER, ept_pointer),
793         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
794         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
795         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
796         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
797         FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
798         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
799         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
800         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
801         FIELD64(PML_ADDRESS, pml_address),
802         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
803         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
804         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
805         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
806         FIELD64(GUEST_PDPTR0, guest_pdptr0),
807         FIELD64(GUEST_PDPTR1, guest_pdptr1),
808         FIELD64(GUEST_PDPTR2, guest_pdptr2),
809         FIELD64(GUEST_PDPTR3, guest_pdptr3),
810         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
811         FIELD64(HOST_IA32_PAT, host_ia32_pat),
812         FIELD64(HOST_IA32_EFER, host_ia32_efer),
813         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
814         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
815         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
816         FIELD(EXCEPTION_BITMAP, exception_bitmap),
817         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
818         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
819         FIELD(CR3_TARGET_COUNT, cr3_target_count),
820         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
821         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
822         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
823         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
824         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
825         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
826         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
827         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
828         FIELD(TPR_THRESHOLD, tpr_threshold),
829         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
830         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
831         FIELD(VM_EXIT_REASON, vm_exit_reason),
832         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
833         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
834         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
835         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
836         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
837         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
838         FIELD(GUEST_ES_LIMIT, guest_es_limit),
839         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
840         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
841         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
842         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
843         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
844         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
845         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
846         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
847         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
848         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
849         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
850         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
851         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
852         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
853         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
854         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
855         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
856         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
857         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
858         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
859         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
860         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
861         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
862         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
863         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
864         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
865         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
866         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
867         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
868         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
869         FIELD(EXIT_QUALIFICATION, exit_qualification),
870         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
871         FIELD(GUEST_CR0, guest_cr0),
872         FIELD(GUEST_CR3, guest_cr3),
873         FIELD(GUEST_CR4, guest_cr4),
874         FIELD(GUEST_ES_BASE, guest_es_base),
875         FIELD(GUEST_CS_BASE, guest_cs_base),
876         FIELD(GUEST_SS_BASE, guest_ss_base),
877         FIELD(GUEST_DS_BASE, guest_ds_base),
878         FIELD(GUEST_FS_BASE, guest_fs_base),
879         FIELD(GUEST_GS_BASE, guest_gs_base),
880         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
881         FIELD(GUEST_TR_BASE, guest_tr_base),
882         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
883         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
884         FIELD(GUEST_DR7, guest_dr7),
885         FIELD(GUEST_RSP, guest_rsp),
886         FIELD(GUEST_RIP, guest_rip),
887         FIELD(GUEST_RFLAGS, guest_rflags),
888         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
889         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
890         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
891         FIELD(HOST_CR0, host_cr0),
892         FIELD(HOST_CR3, host_cr3),
893         FIELD(HOST_CR4, host_cr4),
894         FIELD(HOST_FS_BASE, host_fs_base),
895         FIELD(HOST_GS_BASE, host_gs_base),
896         FIELD(HOST_TR_BASE, host_tr_base),
897         FIELD(HOST_GDTR_BASE, host_gdtr_base),
898         FIELD(HOST_IDTR_BASE, host_idtr_base),
899         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
900         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
901         FIELD(HOST_RSP, host_rsp),
902         FIELD(HOST_RIP, host_rip),
903 };
904 
905 static inline short vmcs_field_to_offset(unsigned long field)
906 {
907         const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
908         unsigned short offset;
909 
910         BUILD_BUG_ON(size > SHRT_MAX);
911         if (field >= size)
912                 return -ENOENT;
913 
914         field = array_index_nospec(field, size);
915         offset = vmcs_field_to_offset_table[field];
916         if (offset == 0)
917                 return -ENOENT;
918         return offset;
919 }
920 
921 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
922 {
923         return to_vmx(vcpu)->nested.cached_vmcs12;
924 }
925 
926 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
927 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
928 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
929 static bool vmx_xsaves_supported(void);
930 static void vmx_set_segment(struct kvm_vcpu *vcpu,
931                             struct kvm_segment *var, int seg);
932 static void vmx_get_segment(struct kvm_vcpu *vcpu,
933                             struct kvm_segment *var, int seg);
934 static bool guest_state_valid(struct kvm_vcpu *vcpu);
935 static u32 vmx_segment_access_rights(struct kvm_segment *var);
936 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
937 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
938 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
939 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
940                                             u16 error_code);
941 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
942 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
943                                                           u32 msr, int type);
944 
945 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
946 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
947 /*
948  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
949  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
950  */
951 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
952 
953 /*
954  * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
955  * can find which vCPU should be waken up.
956  */
957 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
958 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
959 
960 enum {
961         VMX_IO_BITMAP_A,
962         VMX_IO_BITMAP_B,
963         VMX_VMREAD_BITMAP,
964         VMX_VMWRITE_BITMAP,
965         VMX_BITMAP_NR
966 };
967 
968 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
969 
970 #define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
971 #define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
972 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
973 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
974 
975 static bool cpu_has_load_ia32_efer;
976 static bool cpu_has_load_perf_global_ctrl;
977 
978 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
979 static DEFINE_SPINLOCK(vmx_vpid_lock);
980 
981 static struct vmcs_config {
982         int size;
983         int order;
984         u32 basic_cap;
985         u32 revision_id;
986         u32 pin_based_exec_ctrl;
987         u32 cpu_based_exec_ctrl;
988         u32 cpu_based_2nd_exec_ctrl;
989         u32 vmexit_ctrl;
990         u32 vmentry_ctrl;
991 } vmcs_config;
992 
993 static struct vmx_capability {
994         u32 ept;
995         u32 vpid;
996 } vmx_capability;
997 
998 #define VMX_SEGMENT_FIELD(seg)                                  \
999         [VCPU_SREG_##seg] = {                                   \
1000                 .selector = GUEST_##seg##_SELECTOR,             \
1001                 .base = GUEST_##seg##_BASE,                     \
1002                 .limit = GUEST_##seg##_LIMIT,                   \
1003                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
1004         }
1005 
1006 static const struct kvm_vmx_segment_field {
1007         unsigned selector;
1008         unsigned base;
1009         unsigned limit;
1010         unsigned ar_bytes;
1011 } kvm_vmx_segment_fields[] = {
1012         VMX_SEGMENT_FIELD(CS),
1013         VMX_SEGMENT_FIELD(DS),
1014         VMX_SEGMENT_FIELD(ES),
1015         VMX_SEGMENT_FIELD(FS),
1016         VMX_SEGMENT_FIELD(GS),
1017         VMX_SEGMENT_FIELD(SS),
1018         VMX_SEGMENT_FIELD(TR),
1019         VMX_SEGMENT_FIELD(LDTR),
1020 };
1021 
1022 static u64 host_efer;
1023 
1024 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1025 
1026 /*
1027  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1028  * away by decrementing the array size.
1029  */
1030 static const u32 vmx_msr_index[] = {
1031 #ifdef CONFIG_X86_64
1032         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1033 #endif
1034         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1035 };
1036 
1037 static inline bool is_exception_n(u32 intr_info, u8 vector)
1038 {
1039         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1040                              INTR_INFO_VALID_MASK)) ==
1041                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1042 }
1043 
1044 static inline bool is_debug(u32 intr_info)
1045 {
1046         return is_exception_n(intr_info, DB_VECTOR);
1047 }
1048 
1049 static inline bool is_breakpoint(u32 intr_info)
1050 {
1051         return is_exception_n(intr_info, BP_VECTOR);
1052 }
1053 
1054 static inline bool is_page_fault(u32 intr_info)
1055 {
1056         return is_exception_n(intr_info, PF_VECTOR);
1057 }
1058 
1059 static inline bool is_no_device(u32 intr_info)
1060 {
1061         return is_exception_n(intr_info, NM_VECTOR);
1062 }
1063 
1064 static inline bool is_invalid_opcode(u32 intr_info)
1065 {
1066         return is_exception_n(intr_info, UD_VECTOR);
1067 }
1068 
1069 static inline bool is_external_interrupt(u32 intr_info)
1070 {
1071         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1072                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1073 }
1074 
1075 static inline bool is_machine_check(u32 intr_info)
1076 {
1077         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1078                              INTR_INFO_VALID_MASK)) ==
1079                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1080 }
1081 
1082 /* Undocumented: icebp/int1 */
1083 static inline bool is_icebp(u32 intr_info)
1084 {
1085         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1086                 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1087 }
1088 
1089 static inline bool cpu_has_vmx_msr_bitmap(void)
1090 {
1091         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1092 }
1093 
1094 static inline bool cpu_has_vmx_tpr_shadow(void)
1095 {
1096         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1097 }
1098 
1099 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1100 {
1101         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1102 }
1103 
1104 static inline bool cpu_has_secondary_exec_ctrls(void)
1105 {
1106         return vmcs_config.cpu_based_exec_ctrl &
1107                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1108 }
1109 
1110 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1111 {
1112         return vmcs_config.cpu_based_2nd_exec_ctrl &
1113                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1114 }
1115 
1116 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1117 {
1118         return vmcs_config.cpu_based_2nd_exec_ctrl &
1119                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1120 }
1121 
1122 static inline bool cpu_has_vmx_apic_register_virt(void)
1123 {
1124         return vmcs_config.cpu_based_2nd_exec_ctrl &
1125                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1126 }
1127 
1128 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1129 {
1130         return vmcs_config.cpu_based_2nd_exec_ctrl &
1131                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1132 }
1133 
1134 /*
1135  * Comment's format: document - errata name - stepping - processor name.
1136  * Refer from
1137  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1138  */
1139 static u32 vmx_preemption_cpu_tfms[] = {
1140 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1141 0x000206E6,
1142 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1143 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1144 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1145 0x00020652,
1146 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1147 0x00020655,
1148 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1149 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1150 /*
1151  * 320767.pdf - AAP86  - B1 -
1152  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1153  */
1154 0x000106E5,
1155 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1156 0x000106A0,
1157 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1158 0x000106A1,
1159 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1160 0x000106A4,
1161  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1162  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1163  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1164 0x000106A5,
1165 };
1166 
1167 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1168 {
1169         u32 eax = cpuid_eax(0x00000001), i;
1170 
1171         /* Clear the reserved bits */
1172         eax &= ~(0x3U << 14 | 0xfU << 28);
1173         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1174                 if (eax == vmx_preemption_cpu_tfms[i])
1175                         return true;
1176 
1177         return false;
1178 }
1179 
1180 static inline bool cpu_has_vmx_preemption_timer(void)
1181 {
1182         return vmcs_config.pin_based_exec_ctrl &
1183                 PIN_BASED_VMX_PREEMPTION_TIMER;
1184 }
1185 
1186 static inline bool cpu_has_vmx_posted_intr(void)
1187 {
1188         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1189                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1190 }
1191 
1192 static inline bool cpu_has_vmx_apicv(void)
1193 {
1194         return cpu_has_vmx_apic_register_virt() &&
1195                 cpu_has_vmx_virtual_intr_delivery() &&
1196                 cpu_has_vmx_posted_intr();
1197 }
1198 
1199 static inline bool cpu_has_vmx_flexpriority(void)
1200 {
1201         return cpu_has_vmx_tpr_shadow() &&
1202                 cpu_has_vmx_virtualize_apic_accesses();
1203 }
1204 
1205 static inline bool cpu_has_vmx_ept_execute_only(void)
1206 {
1207         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1208 }
1209 
1210 static inline bool cpu_has_vmx_ept_2m_page(void)
1211 {
1212         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1213 }
1214 
1215 static inline bool cpu_has_vmx_ept_1g_page(void)
1216 {
1217         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1218 }
1219 
1220 static inline bool cpu_has_vmx_ept_4levels(void)
1221 {
1222         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1223 }
1224 
1225 static inline bool cpu_has_vmx_ept_mt_wb(void)
1226 {
1227         return vmx_capability.ept & VMX_EPTP_WB_BIT;
1228 }
1229 
1230 static inline bool cpu_has_vmx_ept_5levels(void)
1231 {
1232         return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1233 }
1234 
1235 static inline bool cpu_has_vmx_ept_ad_bits(void)
1236 {
1237         return vmx_capability.ept & VMX_EPT_AD_BIT;
1238 }
1239 
1240 static inline bool cpu_has_vmx_invept_context(void)
1241 {
1242         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1243 }
1244 
1245 static inline bool cpu_has_vmx_invept_global(void)
1246 {
1247         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1248 }
1249 
1250 static inline bool cpu_has_vmx_invvpid_single(void)
1251 {
1252         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1253 }
1254 
1255 static inline bool cpu_has_vmx_invvpid_global(void)
1256 {
1257         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1258 }
1259 
1260 static inline bool cpu_has_vmx_invvpid(void)
1261 {
1262         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1263 }
1264 
1265 static inline bool cpu_has_vmx_ept(void)
1266 {
1267         return vmcs_config.cpu_based_2nd_exec_ctrl &
1268                 SECONDARY_EXEC_ENABLE_EPT;
1269 }
1270 
1271 static inline bool cpu_has_vmx_unrestricted_guest(void)
1272 {
1273         return vmcs_config.cpu_based_2nd_exec_ctrl &
1274                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1275 }
1276 
1277 static inline bool cpu_has_vmx_ple(void)
1278 {
1279         return vmcs_config.cpu_based_2nd_exec_ctrl &
1280                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1281 }
1282 
1283 static inline bool cpu_has_vmx_basic_inout(void)
1284 {
1285         return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1286 }
1287 
1288 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1289 {
1290         return flexpriority_enabled && lapic_in_kernel(vcpu);
1291 }
1292 
1293 static inline bool cpu_has_vmx_vpid(void)
1294 {
1295         return vmcs_config.cpu_based_2nd_exec_ctrl &
1296                 SECONDARY_EXEC_ENABLE_VPID;
1297 }
1298 
1299 static inline bool cpu_has_vmx_rdtscp(void)
1300 {
1301         return vmcs_config.cpu_based_2nd_exec_ctrl &
1302                 SECONDARY_EXEC_RDTSCP;
1303 }
1304 
1305 static inline bool cpu_has_vmx_invpcid(void)
1306 {
1307         return vmcs_config.cpu_based_2nd_exec_ctrl &
1308                 SECONDARY_EXEC_ENABLE_INVPCID;
1309 }
1310 
1311 static inline bool cpu_has_virtual_nmis(void)
1312 {
1313         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1314 }
1315 
1316 static inline bool cpu_has_vmx_wbinvd_exit(void)
1317 {
1318         return vmcs_config.cpu_based_2nd_exec_ctrl &
1319                 SECONDARY_EXEC_WBINVD_EXITING;
1320 }
1321 
1322 static inline bool cpu_has_vmx_shadow_vmcs(void)
1323 {
1324         u64 vmx_msr;
1325         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1326         /* check if the cpu supports writing r/o exit information fields */
1327         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1328                 return false;
1329 
1330         return vmcs_config.cpu_based_2nd_exec_ctrl &
1331                 SECONDARY_EXEC_SHADOW_VMCS;
1332 }
1333 
1334 static inline bool cpu_has_vmx_pml(void)
1335 {
1336         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1337 }
1338 
1339 static inline bool cpu_has_vmx_tsc_scaling(void)
1340 {
1341         return vmcs_config.cpu_based_2nd_exec_ctrl &
1342                 SECONDARY_EXEC_TSC_SCALING;
1343 }
1344 
1345 static inline bool cpu_has_vmx_vmfunc(void)
1346 {
1347         return vmcs_config.cpu_based_2nd_exec_ctrl &
1348                 SECONDARY_EXEC_ENABLE_VMFUNC;
1349 }
1350 
1351 static inline bool report_flexpriority(void)
1352 {
1353         return flexpriority_enabled;
1354 }
1355 
1356 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1357 {
1358         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
1359 }
1360 
1361 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1362 {
1363         return vmcs12->cpu_based_vm_exec_control & bit;
1364 }
1365 
1366 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1367 {
1368         return (vmcs12->cpu_based_vm_exec_control &
1369                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1370                 (vmcs12->secondary_vm_exec_control & bit);
1371 }
1372 
1373 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1374 {
1375         return vmcs12->pin_based_vm_exec_control &
1376                 PIN_BASED_VMX_PREEMPTION_TIMER;
1377 }
1378 
1379 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1380 {
1381         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1382 }
1383 
1384 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1385 {
1386         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
1387 }
1388 
1389 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
1390 {
1391         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
1392 }
1393 
1394 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1395 {
1396         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1397 }
1398 
1399 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1400 {
1401         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1402 }
1403 
1404 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1405 {
1406         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1407 }
1408 
1409 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1410 {
1411         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1412 }
1413 
1414 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1415 {
1416         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1417 }
1418 
1419 static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
1420 {
1421         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
1422 }
1423 
1424 static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
1425 {
1426         return nested_cpu_has_vmfunc(vmcs12) &&
1427                 (vmcs12->vm_function_control &
1428                  VMX_VMFUNC_EPTP_SWITCHING);
1429 }
1430 
1431 static inline bool is_nmi(u32 intr_info)
1432 {
1433         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1434                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
1435 }
1436 
1437 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1438                               u32 exit_intr_info,
1439                               unsigned long exit_qualification);
1440 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1441                         struct vmcs12 *vmcs12,
1442                         u32 reason, unsigned long qualification);
1443 
1444 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1445 {
1446         int i;
1447 
1448         for (i = 0; i < vmx->nmsrs; ++i)
1449                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1450                         return i;
1451         return -1;
1452 }
1453 
1454 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1455 {
1456     struct {
1457         u64 vpid : 16;
1458         u64 rsvd : 48;
1459         u64 gva;
1460     } operand = { vpid, 0, gva };
1461 
1462     asm volatile (__ex(ASM_VMX_INVVPID)
1463                   /* CF==1 or ZF==1 --> rc = -1 */
1464                   "; ja 1f ; ud2 ; 1:"
1465                   : : "a"(&operand), "c"(ext) : "cc", "memory");
1466 }
1467 
1468 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1469 {
1470         struct {
1471                 u64 eptp, gpa;
1472         } operand = {eptp, gpa};
1473 
1474         asm volatile (__ex(ASM_VMX_INVEPT)
1475                         /* CF==1 or ZF==1 --> rc = -1 */
1476                         "; ja 1f ; ud2 ; 1:\n"
1477                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1478 }
1479 
1480 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1481 {
1482         int i;
1483 
1484         i = __find_msr_index(vmx, msr);
1485         if (i >= 0)
1486                 return &vmx->guest_msrs[i];
1487         return NULL;
1488 }
1489 
1490 static void vmcs_clear(struct vmcs *vmcs)
1491 {
1492         u64 phys_addr = __pa(vmcs);
1493         u8 error;
1494 
1495         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1496                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1497                       : "cc", "memory");
1498         if (error)
1499                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1500                        vmcs, phys_addr);
1501 }
1502 
1503 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1504 {
1505         vmcs_clear(loaded_vmcs->vmcs);
1506         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1507                 vmcs_clear(loaded_vmcs->shadow_vmcs);
1508         loaded_vmcs->cpu = -1;
1509         loaded_vmcs->launched = 0;
1510 }
1511 
1512 static void vmcs_load(struct vmcs *vmcs)
1513 {
1514         u64 phys_addr = __pa(vmcs);
1515         u8 error;
1516 
1517         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1518                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1519                         : "cc", "memory");
1520         if (error)
1521                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1522                        vmcs, phys_addr);
1523 }
1524 
1525 #ifdef CONFIG_KEXEC_CORE
1526 /*
1527  * This bitmap is used to indicate whether the vmclear
1528  * operation is enabled on all cpus. All disabled by
1529  * default.
1530  */
1531 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1532 
1533 static inline void crash_enable_local_vmclear(int cpu)
1534 {
1535         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1536 }
1537 
1538 static inline void crash_disable_local_vmclear(int cpu)
1539 {
1540         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1541 }
1542 
1543 static inline int crash_local_vmclear_enabled(int cpu)
1544 {
1545         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1546 }
1547 
1548 static void crash_vmclear_local_loaded_vmcss(void)
1549 {
1550         int cpu = raw_smp_processor_id();
1551         struct loaded_vmcs *v;
1552 
1553         if (!crash_local_vmclear_enabled(cpu))
1554                 return;
1555 
1556         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1557                             loaded_vmcss_on_cpu_link)
1558                 vmcs_clear(v->vmcs);
1559 }
1560 #else
1561 static inline void crash_enable_local_vmclear(int cpu) { }
1562 static inline void crash_disable_local_vmclear(int cpu) { }
1563 #endif /* CONFIG_KEXEC_CORE */
1564 
1565 static void __loaded_vmcs_clear(void *arg)
1566 {
1567         struct loaded_vmcs *loaded_vmcs = arg;
1568         int cpu = raw_smp_processor_id();
1569 
1570         if (loaded_vmcs->cpu != cpu)
1571                 return; /* vcpu migration can race with cpu offline */
1572         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1573                 per_cpu(current_vmcs, cpu) = NULL;
1574         crash_disable_local_vmclear(cpu);
1575         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1576 
1577         /*
1578          * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
1579          * is before setting loaded_vmcs->vcpu to -1 which is done in
1580          * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
1581          * then adds the vmcs into percpu list before it is deleted.
1582          */
1583         smp_wmb();
1584 
1585         loaded_vmcs_init(loaded_vmcs);
1586         crash_enable_local_vmclear(cpu);
1587 }
1588 
1589 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1590 {
1591         int cpu = loaded_vmcs->cpu;
1592 
1593         if (cpu != -1)
1594                 smp_call_function_single(cpu,
1595                          __loaded_vmcs_clear, loaded_vmcs, 1);
1596 }
1597 
1598 static inline void vpid_sync_vcpu_single(int vpid)
1599 {
1600         if (vpid == 0)
1601                 return;
1602 
1603         if (cpu_has_vmx_invvpid_single())
1604                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1605 }
1606 
1607 static inline void vpid_sync_vcpu_global(void)
1608 {
1609         if (cpu_has_vmx_invvpid_global())
1610                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1611 }
1612 
1613 static inline void vpid_sync_context(int vpid)
1614 {
1615         if (cpu_has_vmx_invvpid_single())
1616                 vpid_sync_vcpu_single(vpid);
1617         else
1618                 vpid_sync_vcpu_global();
1619 }
1620 
1621 static inline void ept_sync_global(void)
1622 {
1623         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1624 }
1625 
1626 static inline void ept_sync_context(u64 eptp)
1627 {
1628         if (cpu_has_vmx_invept_context())
1629                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1630         else
1631                 ept_sync_global();
1632 }
1633 
1634 static __always_inline void vmcs_check16(unsigned long field)
1635 {
1636         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1637                          "16-bit accessor invalid for 64-bit field");
1638         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1639                          "16-bit accessor invalid for 64-bit high field");
1640         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1641                          "16-bit accessor invalid for 32-bit high field");
1642         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1643                          "16-bit accessor invalid for natural width field");
1644 }
1645 
1646 static __always_inline void vmcs_check32(unsigned long field)
1647 {
1648         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1649                          "32-bit accessor invalid for 16-bit field");
1650         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1651                          "32-bit accessor invalid for natural width field");
1652 }
1653 
1654 static __always_inline void vmcs_check64(unsigned long field)
1655 {
1656         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1657                          "64-bit accessor invalid for 16-bit field");
1658         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1659                          "64-bit accessor invalid for 64-bit high field");
1660         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1661                          "64-bit accessor invalid for 32-bit field");
1662         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1663                          "64-bit accessor invalid for natural width field");
1664 }
1665 
1666 static __always_inline void vmcs_checkl(unsigned long field)
1667 {
1668         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1669                          "Natural width accessor invalid for 16-bit field");
1670         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1671                          "Natural width accessor invalid for 64-bit field");
1672         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1673                          "Natural width accessor invalid for 64-bit high field");
1674         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1675                          "Natural width accessor invalid for 32-bit field");
1676 }
1677 
1678 static __always_inline unsigned long __vmcs_readl(unsigned long field)
1679 {
1680         unsigned long value;
1681 
1682         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1683                       : "=a"(value) : "d"(field) : "cc");
1684         return value;
1685 }
1686 
1687 static __always_inline u16 vmcs_read16(unsigned long field)
1688 {
1689         vmcs_check16(field);
1690         return __vmcs_readl(field);
1691 }
1692 
1693 static __always_inline u32 vmcs_read32(unsigned long field)
1694 {
1695         vmcs_check32(field);
1696         return __vmcs_readl(field);
1697 }
1698 
1699 static __always_inline u64 vmcs_read64(unsigned long field)
1700 {
1701         vmcs_check64(field);
1702 #ifdef CONFIG_X86_64
1703         return __vmcs_readl(field);
1704 #else
1705         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
1706 #endif
1707 }
1708 
1709 static __always_inline unsigned long vmcs_readl(unsigned long field)
1710 {
1711         vmcs_checkl(field);
1712         return __vmcs_readl(field);
1713 }
1714 
1715 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1716 {
1717         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1718                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1719         dump_stack();
1720 }
1721 
1722 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
1723 {
1724         u8 error;
1725 
1726         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1727                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1728         if (unlikely(error))
1729                 vmwrite_error(field, value);
1730 }
1731 
1732 static __always_inline void vmcs_write16(unsigned long field, u16 value)
1733 {
1734         vmcs_check16(field);
1735         __vmcs_writel(field, value);
1736 }
1737 
1738 static __always_inline void vmcs_write32(unsigned long field, u32 value)
1739 {
1740         vmcs_check32(field);
1741         __vmcs_writel(field, value);
1742 }
1743 
1744 static __always_inline void vmcs_write64(unsigned long field, u64 value)
1745 {
1746         vmcs_check64(field);
1747         __vmcs_writel(field, value);
1748 #ifndef CONFIG_X86_64
1749         asm volatile ("");
1750         __vmcs_writel(field+1, value >> 32);
1751 #endif
1752 }
1753 
1754 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
1755 {
1756         vmcs_checkl(field);
1757         __vmcs_writel(field, value);
1758 }
1759 
1760 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
1761 {
1762         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1763                          "vmcs_clear_bits does not support 64-bit fields");
1764         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
1765 }
1766 
1767 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
1768 {
1769         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1770                          "vmcs_set_bits does not support 64-bit fields");
1771         __vmcs_writel(field, __vmcs_readl(field) | mask);
1772 }
1773 
1774 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
1775 {
1776         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
1777 }
1778 
1779 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1780 {
1781         vmcs_write32(VM_ENTRY_CONTROLS, val);
1782         vmx->vm_entry_controls_shadow = val;
1783 }
1784 
1785 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1786 {
1787         if (vmx->vm_entry_controls_shadow != val)
1788                 vm_entry_controls_init(vmx, val);
1789 }
1790 
1791 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1792 {
1793         return vmx->vm_entry_controls_shadow;
1794 }
1795 
1796 
1797 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1798 {
1799         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1800 }
1801 
1802 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1803 {
1804         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1805 }
1806 
1807 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
1808 {
1809         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
1810 }
1811 
1812 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1813 {
1814         vmcs_write32(VM_EXIT_CONTROLS, val);
1815         vmx->vm_exit_controls_shadow = val;
1816 }
1817 
1818 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1819 {
1820         if (vmx->vm_exit_controls_shadow != val)
1821                 vm_exit_controls_init(vmx, val);
1822 }
1823 
1824 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1825 {
1826         return vmx->vm_exit_controls_shadow;
1827 }
1828 
1829 
1830 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1831 {
1832         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1833 }
1834 
1835 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1836 {
1837         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1838 }
1839 
1840 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1841 {
1842         vmx->segment_cache.bitmask = 0;
1843 }
1844 
1845 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1846                                        unsigned field)
1847 {
1848         bool ret;
1849         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1850 
1851         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1852                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1853                 vmx->segment_cache.bitmask = 0;
1854         }
1855         ret = vmx->segment_cache.bitmask & mask;
1856         vmx->segment_cache.bitmask |= mask;
1857         return ret;
1858 }
1859 
1860 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1861 {
1862         u16 *p = &vmx->segment_cache.seg[seg].selector;
1863 
1864         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1865                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1866         return *p;
1867 }
1868 
1869 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1870 {
1871         ulong *p = &vmx->segment_cache.seg[seg].base;
1872 
1873         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1874                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1875         return *p;
1876 }
1877 
1878 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1879 {
1880         u32 *p = &vmx->segment_cache.seg[seg].limit;
1881 
1882         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1883                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1884         return *p;
1885 }
1886 
1887 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1888 {
1889         u32 *p = &vmx->segment_cache.seg[seg].ar;
1890 
1891         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1892                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1893         return *p;
1894 }
1895 
1896 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1897 {
1898         u32 eb;
1899 
1900         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1901              (1u << DB_VECTOR) | (1u << AC_VECTOR);
1902         if ((vcpu->guest_debug &
1903              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1904             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1905                 eb |= 1u << BP_VECTOR;
1906         if (to_vmx(vcpu)->rmode.vm86_active)
1907                 eb = ~0;
1908         if (enable_ept)
1909                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1910 
1911         /* When we are running a nested L2 guest and L1 specified for it a
1912          * certain exception bitmap, we must trap the same exceptions and pass
1913          * them to L1. When running L2, we will only handle the exceptions
1914          * specified above if L1 did not want them.
1915          */
1916         if (is_guest_mode(vcpu))
1917                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1918 
1919         vmcs_write32(EXCEPTION_BITMAP, eb);
1920 }
1921 
1922 /*
1923  * Check if MSR is intercepted for currently loaded MSR bitmap.
1924  */
1925 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
1926 {
1927         unsigned long *msr_bitmap;
1928         int f = sizeof(unsigned long);
1929 
1930         if (!cpu_has_vmx_msr_bitmap())
1931                 return true;
1932 
1933         msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
1934 
1935         if (msr <= 0x1fff) {
1936                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
1937         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1938                 msr &= 0x1fff;
1939                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
1940         }
1941 
1942         return true;
1943 }
1944 
1945 /*
1946  * Check if MSR is intercepted for L01 MSR bitmap.
1947  */
1948 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
1949 {
1950         unsigned long *msr_bitmap;
1951         int f = sizeof(unsigned long);
1952 
1953         if (!cpu_has_vmx_msr_bitmap())
1954                 return true;
1955 
1956         msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
1957 
1958         if (msr <= 0x1fff) {
1959                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
1960         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1961                 msr &= 0x1fff;
1962                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
1963         }
1964 
1965         return true;
1966 }
1967 
1968 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1969                 unsigned long entry, unsigned long exit)
1970 {
1971         vm_entry_controls_clearbit(vmx, entry);
1972         vm_exit_controls_clearbit(vmx, exit);
1973 }
1974 
1975 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1976 {
1977         unsigned i;
1978         struct msr_autoload *m = &vmx->msr_autoload;
1979 
1980         switch (msr) {
1981         case MSR_EFER:
1982                 if (cpu_has_load_ia32_efer) {
1983                         clear_atomic_switch_msr_special(vmx,
1984                                         VM_ENTRY_LOAD_IA32_EFER,
1985                                         VM_EXIT_LOAD_IA32_EFER);
1986                         return;
1987                 }
1988                 break;
1989         case MSR_CORE_PERF_GLOBAL_CTRL:
1990                 if (cpu_has_load_perf_global_ctrl) {
1991                         clear_atomic_switch_msr_special(vmx,
1992                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1993                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1994                         return;
1995                 }
1996                 break;
1997         }
1998 
1999         for (i = 0; i < m->nr; ++i)
2000                 if (m->guest[i].index == msr)
2001                         break;
2002 
2003         if (i == m->nr)
2004                 return;
2005         --m->nr;
2006         m->guest[i] = m->guest[m->nr];
2007         m->host[i] = m->host[m->nr];
2008         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
2009         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
2010 }
2011 
2012 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2013                 unsigned long entry, unsigned long exit,
2014                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2015                 u64 guest_val, u64 host_val)
2016 {
2017         vmcs_write64(guest_val_vmcs, guest_val);
2018         vmcs_write64(host_val_vmcs, host_val);
2019         vm_entry_controls_setbit(vmx, entry);
2020         vm_exit_controls_setbit(vmx, exit);
2021 }
2022 
2023 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
2024                                   u64 guest_val, u64 host_val)
2025 {
2026         unsigned i;
2027         struct msr_autoload *m = &vmx->msr_autoload;
2028 
2029         switch (msr) {
2030         case MSR_EFER:
2031                 if (cpu_has_load_ia32_efer) {
2032                         add_atomic_switch_msr_special(vmx,
2033                                         VM_ENTRY_LOAD_IA32_EFER,
2034                                         VM_EXIT_LOAD_IA32_EFER,
2035                                         GUEST_IA32_EFER,
2036                                         HOST_IA32_EFER,
2037                                         guest_val, host_val);
2038                         return;
2039                 }
2040                 break;
2041         case MSR_CORE_PERF_GLOBAL_CTRL:
2042                 if (cpu_has_load_perf_global_ctrl) {
2043                         add_atomic_switch_msr_special(vmx,
2044                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2045                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2046                                         GUEST_IA32_PERF_GLOBAL_CTRL,
2047                                         HOST_IA32_PERF_GLOBAL_CTRL,
2048                                         guest_val, host_val);
2049                         return;
2050                 }
2051                 break;
2052         case MSR_IA32_PEBS_ENABLE:
2053                 /* PEBS needs a quiescent period after being disabled (to write
2054                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
2055                  * provide that period, so a CPU could write host's record into
2056                  * guest's memory.
2057                  */
2058                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2059         }
2060 
2061         for (i = 0; i < m->nr; ++i)
2062                 if (m->guest[i].index == msr)
2063                         break;
2064 
2065         if (i == NR_AUTOLOAD_MSRS) {
2066                 printk_once(KERN_WARNING "Not enough msr switch entries. "
2067                                 "Can't add msr %x\n", msr);
2068                 return;
2069         } else if (i == m->nr) {
2070                 ++m->nr;
2071                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
2072                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
2073         }
2074 
2075         m->guest[i].index = msr;
2076         m->guest[i].value = guest_val;
2077         m->host[i].index = msr;
2078         m->host[i].value = host_val;
2079 }
2080 
2081 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2082 {
2083         u64 guest_efer = vmx->vcpu.arch.efer;
2084         u64 ignore_bits = 0;
2085 
2086         if (!enable_ept) {
2087                 /*
2088                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
2089                  * host CPUID is more efficient than testing guest CPUID
2090                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
2091                  */
2092                 if (boot_cpu_has(X86_FEATURE_SMEP))
2093                         guest_efer |= EFER_NX;
2094                 else if (!(guest_efer & EFER_NX))
2095                         ignore_bits |= EFER_NX;
2096         }
2097 
2098         /*
2099          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2100          */
2101         ignore_bits |= EFER_SCE;
2102 #ifdef CONFIG_X86_64
2103         ignore_bits |= EFER_LMA | EFER_LME;
2104         /* SCE is meaningful only in long mode on Intel */
2105         if (guest_efer & EFER_LMA)
2106                 ignore_bits &= ~(u64)EFER_SCE;
2107 #endif
2108 
2109         clear_atomic_switch_msr(vmx, MSR_EFER);
2110 
2111         /*
2112          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2113          * On CPUs that support "load IA32_EFER", always switch EFER
2114          * atomically, since it's faster than switching it manually.
2115          */
2116         if (cpu_has_load_ia32_efer ||
2117             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2118                 if (!(guest_efer & EFER_LMA))
2119                         guest_efer &= ~EFER_LME;
2120                 if (guest_efer != host_efer)
2121                         add_atomic_switch_msr(vmx, MSR_EFER,
2122                                               guest_efer, host_efer);
2123                 return false;
2124         } else {
2125                 guest_efer &= ~ignore_bits;
2126                 guest_efer |= host_efer & ignore_bits;
2127 
2128                 vmx->guest_msrs[efer_offset].data = guest_efer;
2129                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2130 
2131                 return true;
2132         }
2133 }
2134 
2135 #ifdef CONFIG_X86_32
2136 /*
2137  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2138  * VMCS rather than the segment table.  KVM uses this helper to figure
2139  * out the current bases to poke them into the VMCS before entry.
2140  */
2141 static unsigned long segment_base(u16 selector)
2142 {
2143         struct desc_struct *table;
2144         unsigned long v;
2145 
2146         if (!(selector & ~SEGMENT_RPL_MASK))
2147                 return 0;
2148 
2149         table = get_current_gdt_ro();
2150 
2151         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2152                 u16 ldt_selector = kvm_read_ldt();
2153 
2154                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2155                         return 0;
2156 
2157                 table = (struct desc_struct *)segment_base(ldt_selector);
2158         }
2159         v = get_desc_base(&table[selector >> 3]);
2160         return v;
2161 }
2162 #endif
2163 
2164 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2165 {
2166         struct vcpu_vmx *vmx = to_vmx(vcpu);
2167         int i;
2168 
2169         if (vmx->host_state.loaded)
2170                 return;
2171 
2172         vmx->host_state.loaded = 1;
2173         /*
2174          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2175          * allow segment selectors with cpl > 0 or ti == 1.
2176          */
2177         vmx->host_state.ldt_sel = kvm_read_ldt();
2178         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
2179         savesegment(fs, vmx->host_state.fs_sel);
2180         if (!(vmx->host_state.fs_sel & 7)) {
2181                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
2182                 vmx->host_state.fs_reload_needed = 0;
2183         } else {
2184                 vmcs_write16(HOST_FS_SELECTOR, 0);
2185                 vmx->host_state.fs_reload_needed = 1;
2186         }
2187         savesegment(gs, vmx->host_state.gs_sel);
2188         if (!(vmx->host_state.gs_sel & 7))
2189                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
2190         else {
2191                 vmcs_write16(HOST_GS_SELECTOR, 0);
2192                 vmx->host_state.gs_ldt_reload_needed = 1;
2193         }
2194 
2195 #ifdef CONFIG_X86_64
2196         savesegment(ds, vmx->host_state.ds_sel);
2197         savesegment(es, vmx->host_state.es_sel);
2198 #endif
2199 
2200 #ifdef CONFIG_X86_64
2201         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2202         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2203 #else
2204         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2205         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2206 #endif
2207 
2208 #ifdef CONFIG_X86_64
2209         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2210         if (is_long_mode(&vmx->vcpu))
2211                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2212 #endif
2213         if (boot_cpu_has(X86_FEATURE_MPX))
2214                 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2215         for (i = 0; i < vmx->save_nmsrs; ++i)
2216                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2217                                    vmx->guest_msrs[i].data,
2218                                    vmx->guest_msrs[i].mask);
2219 }
2220 
2221 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2222 {
2223         if (!vmx->host_state.loaded)
2224                 return;
2225 
2226         ++vmx->vcpu.stat.host_state_reload;
2227         vmx->host_state.loaded = 0;
2228 #ifdef CONFIG_X86_64
2229         if (is_long_mode(&vmx->vcpu))
2230                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2231 #endif
2232         if (vmx->host_state.gs_ldt_reload_needed) {
2233                 kvm_load_ldt(vmx->host_state.ldt_sel);
2234 #ifdef CONFIG_X86_64
2235                 load_gs_index(vmx->host_state.gs_sel);
2236 #else
2237                 loadsegment(gs, vmx->host_state.gs_sel);
2238 #endif
2239         }
2240         if (vmx->host_state.fs_reload_needed)
2241                 loadsegment(fs, vmx->host_state.fs_sel);
2242 #ifdef CONFIG_X86_64
2243         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
2244                 loadsegment(ds, vmx->host_state.ds_sel);
2245                 loadsegment(es, vmx->host_state.es_sel);
2246         }
2247 #endif
2248         invalidate_tss_limit();
2249 #ifdef CONFIG_X86_64
2250         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2251 #endif
2252         if (vmx->host_state.msr_host_bndcfgs)
2253                 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2254         load_fixmap_gdt(raw_smp_processor_id());
2255 }
2256 
2257 static void vmx_load_host_state(struct vcpu_vmx *vmx)
2258 {
2259         preempt_disable();
2260         __vmx_load_host_state(vmx);
2261         preempt_enable();
2262 }
2263 
2264 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2265 {
2266         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2267         struct pi_desc old, new;
2268         unsigned int dest;
2269 
2270         /*
2271          * In case of hot-plug or hot-unplug, we may have to undo
2272          * vmx_vcpu_pi_put even if there is no assigned device.  And we
2273          * always keep PI.NDST up to date for simplicity: it makes the
2274          * code easier, and CPU migration is not a fast path.
2275          */
2276         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2277                 return;
2278 
2279         /*
2280          * First handle the simple case where no cmpxchg is necessary; just
2281          * allow posting non-urgent interrupts.
2282          *
2283          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2284          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
2285          * expects the VCPU to be on the blocked_vcpu_list that matches
2286          * PI.NDST.
2287          */
2288         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
2289             vcpu->cpu == cpu) {
2290                 pi_clear_sn(pi_desc);
2291                 return;
2292         }
2293 
2294         /* The full case.  */
2295         do {
2296                 old.control = new.control = pi_desc->control;
2297 
2298                 dest = cpu_physical_id(cpu);
2299 
2300                 if (x2apic_enabled())
2301                         new.ndst = dest;
2302                 else
2303                         new.ndst = (dest << 8) & 0xFF00;
2304 
2305                 new.sn = 0;
2306         } while (cmpxchg64(&pi_desc->control, old.control,
2307                            new.control) != old.control);
2308 }
2309 
2310 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
2311 {
2312         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
2313         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
2314 }
2315 
2316 /*
2317  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2318  * vcpu mutex is already taken.
2319  */
2320 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2321 {
2322         struct vcpu_vmx *vmx = to_vmx(vcpu);
2323         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2324 
2325         if (!already_loaded) {
2326                 loaded_vmcs_clear(vmx->loaded_vmcs);
2327                 local_irq_disable();
2328                 crash_disable_local_vmclear(cpu);
2329 
2330                 /*
2331                  * Read loaded_vmcs->cpu should be before fetching
2332                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
2333                  * See the comments in __loaded_vmcs_clear().
2334                  */
2335                 smp_rmb();
2336 
2337                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2338                          &per_cpu(loaded_vmcss_on_cpu, cpu));
2339                 crash_enable_local_vmclear(cpu);
2340                 local_irq_enable();
2341         }
2342 
2343         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2344                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2345                 vmcs_load(vmx->loaded_vmcs->vmcs);
2346                 indirect_branch_prediction_barrier();
2347         }
2348 
2349         if (!already_loaded) {
2350                 void *gdt = get_current_gdt_ro();
2351                 unsigned long sysenter_esp;
2352 
2353                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2354 
2355                 /*
2356                  * Linux uses per-cpu TSS and GDT, so set these when switching
2357                  * processors.  See 22.2.4.
2358                  */
2359                 vmcs_writel(HOST_TR_BASE,
2360                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2361                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
2362 
2363                 /*
2364                  * VM exits change the host TR limit to 0x67 after a VM
2365                  * exit.  This is okay, since 0x67 covers everything except
2366                  * the IO bitmap and have have code to handle the IO bitmap
2367                  * being lost after a VM exit.
2368                  */
2369                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2370 
2371                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2372                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2373 
2374                 vmx->loaded_vmcs->cpu = cpu;
2375         }
2376 
2377         /* Setup TSC multiplier */
2378         if (kvm_has_tsc_control &&
2379             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2380                 decache_tsc_multiplier(vmx);
2381 
2382         vmx_vcpu_pi_load(vcpu, cpu);
2383         vmx->host_pkru = read_pkru();
2384 }
2385 
2386 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2387 {
2388         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2389 
2390         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2391                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2392                 !kvm_vcpu_apicv_active(vcpu))
2393                 return;
2394 
2395         /* Set SN when the vCPU is preempted */
2396         if (vcpu->preempted)
2397                 pi_set_sn(pi_desc);
2398 }
2399 
2400 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2401 {
2402         vmx_vcpu_pi_put(vcpu);
2403 
2404         __vmx_load_host_state(to_vmx(vcpu));
2405 }
2406 
2407 static bool emulation_required(struct kvm_vcpu *vcpu)
2408 {
2409         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2410 }
2411 
2412 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2413 
2414 /*
2415  * Return the cr0 value that a nested guest would read. This is a combination
2416  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2417  * its hypervisor (cr0_read_shadow).
2418  */
2419 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2420 {
2421         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2422                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2423 }
2424 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2425 {
2426         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2427                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2428 }
2429 
2430 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2431 {
2432         unsigned long rflags, save_rflags;
2433 
2434         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2435                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2436                 rflags = vmcs_readl(GUEST_RFLAGS);
2437                 if (to_vmx(vcpu)->rmode.vm86_active) {
2438                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2439                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2440                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2441                 }
2442                 to_vmx(vcpu)->rflags = rflags;
2443         }
2444         return to_vmx(vcpu)->rflags;
2445 }
2446 
2447 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2448 {
2449         unsigned long old_rflags = vmx_get_rflags(vcpu);
2450 
2451         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2452         to_vmx(vcpu)->rflags = rflags;
2453         if (to_vmx(vcpu)->rmode.vm86_active) {
2454                 to_vmx(vcpu)->rmode.save_rflags = rflags;
2455                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2456         }
2457         vmcs_writel(GUEST_RFLAGS, rflags);
2458 
2459         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
2460                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
2461 }
2462 
2463 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2464 {
2465         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2466         int ret = 0;
2467 
2468         if (interruptibility & GUEST_INTR_STATE_STI)
2469                 ret |= KVM_X86_SHADOW_INT_STI;
2470         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2471                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2472 
2473         return ret;
2474 }
2475 
2476 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2477 {
2478         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2479         u32 interruptibility = interruptibility_old;
2480 
2481         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2482 
2483         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2484                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
2485         else if (mask & KVM_X86_SHADOW_INT_STI)
2486                 interruptibility |= GUEST_INTR_STATE_STI;
2487 
2488         if ((interruptibility != interruptibility_old))
2489                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2490 }
2491 
2492 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2493 {
2494         unsigned long rip;
2495 
2496         rip = kvm_rip_read(vcpu);
2497         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2498         kvm_rip_write(vcpu, rip);
2499 
2500         /* skipping an emulated instruction also counts */
2501         vmx_set_interrupt_shadow(vcpu, 0);
2502 }
2503 
2504 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
2505                                                unsigned long exit_qual)
2506 {
2507         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2508         unsigned int nr = vcpu->arch.exception.nr;
2509         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2510 
2511         if (vcpu->arch.exception.has_error_code) {
2512                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
2513                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2514         }
2515 
2516         if (kvm_exception_is_soft(nr))
2517                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2518         else
2519                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2520 
2521         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
2522             vmx_get_nmi_mask(vcpu))
2523                 intr_info |= INTR_INFO_UNBLOCK_NMI;
2524 
2525         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
2526 }
2527 
2528 /*
2529  * KVM wants to inject page-faults which it got to the guest. This function
2530  * checks whether in a nested guest, we need to inject them to L1 or L2.
2531  */
2532 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
2533 {
2534         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2535         unsigned int nr = vcpu->arch.exception.nr;
2536 
2537         if (nr == PF_VECTOR) {
2538                 if (vcpu->arch.exception.nested_apf) {
2539                         *exit_qual = vcpu->arch.apf.nested_apf_token;
2540                         return 1;
2541                 }
2542                 /*
2543                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
2544                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
2545                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
2546                  * can be written only when inject_pending_event runs.  This should be
2547                  * conditional on a new capability---if the capability is disabled,
2548                  * kvm_multiple_exception would write the ancillary information to
2549                  * CR2 or DR6, for backwards ABI-compatibility.
2550                  */
2551                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
2552                                                     vcpu->arch.exception.error_code)) {
2553                         *exit_qual = vcpu->arch.cr2;
2554                         return 1;
2555                 }
2556         } else {
2557                 if (vmcs12->exception_bitmap & (1u << nr)) {
2558                         if (nr == DB_VECTOR)
2559                                 *exit_qual = vcpu->arch.dr6;
2560                         else
2561                                 *exit_qual = 0;
2562                         return 1;
2563                 }
2564         }
2565 
2566         return 0;
2567 }
2568 
2569 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2570 {
2571         struct vcpu_vmx *vmx = to_vmx(vcpu);
2572         unsigned nr = vcpu->arch.exception.nr;
2573         bool has_error_code = vcpu->arch.exception.has_error_code;
2574         u32 error_code = vcpu->arch.exception.error_code;
2575         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2576 
2577         if (has_error_code) {
2578                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2579                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2580         }
2581 
2582         if (vmx->rmode.vm86_active) {
2583                 int inc_eip = 0;
2584                 if (kvm_exception_is_soft(nr))
2585                         inc_eip = vcpu->arch.event_exit_inst_len;
2586                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2587                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2588                 return;
2589         }
2590 
2591         if (kvm_exception_is_soft(nr)) {
2592                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2593                              vmx->vcpu.arch.event_exit_inst_len);
2594                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2595         } else
2596                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2597 
2598         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2599 }
2600 
2601 static bool vmx_rdtscp_supported(void)
2602 {
2603         return cpu_has_vmx_rdtscp();
2604 }
2605 
2606 static bool vmx_invpcid_supported(void)
2607 {
2608         return cpu_has_vmx_invpcid() && enable_ept;
2609 }
2610 
2611 /*
2612  * Swap MSR entry in host/guest MSR entry array.
2613  */
2614 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2615 {
2616         struct shared_msr_entry tmp;
2617 
2618         tmp = vmx->guest_msrs[to];
2619         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2620         vmx->guest_msrs[from] = tmp;
2621 }
2622 
2623 /*
2624  * Set up the vmcs to automatically save and restore system
2625  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2626  * mode, as fiddling with msrs is very expensive.
2627  */
2628 static void setup_msrs(struct vcpu_vmx *vmx)
2629 {
2630         int save_nmsrs, index;
2631 
2632         save_nmsrs = 0;
2633 #ifdef CONFIG_X86_64
2634         if (is_long_mode(&vmx->vcpu)) {
2635                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2636                 if (index >= 0)
2637                         move_msr_up(vmx, index, save_nmsrs++);
2638                 index = __find_msr_index(vmx, MSR_LSTAR);
2639                 if (index >= 0)
2640                         move_msr_up(vmx, index, save_nmsrs++);
2641                 index = __find_msr_index(vmx, MSR_CSTAR);
2642                 if (index >= 0)
2643                         move_msr_up(vmx, index, save_nmsrs++);
2644                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2645                 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2646                         move_msr_up(vmx, index, save_nmsrs++);
2647                 /*
2648                  * MSR_STAR is only needed on long mode guests, and only
2649                  * if efer.sce is enabled.
2650                  */
2651                 index = __find_msr_index(vmx, MSR_STAR);
2652                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2653                         move_msr_up(vmx, index, save_nmsrs++);
2654         }
2655 #endif
2656         index = __find_msr_index(vmx, MSR_EFER);
2657         if (index >= 0 && update_transition_efer(vmx, index))
2658                 move_msr_up(vmx, index, save_nmsrs++);
2659 
2660         vmx->save_nmsrs = save_nmsrs;
2661 
2662         if (cpu_has_vmx_msr_bitmap())
2663                 vmx_update_msr_bitmap(&vmx->vcpu);
2664 }
2665 
2666 /*
2667  * reads and returns guest's timestamp counter "register"
2668  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
2669  * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
2670  */
2671 static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
2672 {
2673         u64 host_tsc, tsc_offset;
2674 
2675         host_tsc = rdtsc();
2676         tsc_offset = vmcs_read64(TSC_OFFSET);
2677         return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
2678 }
2679 
2680 /*
2681  * writes 'offset' into guest's timestamp counter offset register
2682  */
2683 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2684 {
2685         if (is_guest_mode(vcpu)) {
2686                 /*
2687                  * We're here if L1 chose not to trap WRMSR to TSC. According
2688                  * to the spec, this should set L1's TSC; The offset that L1
2689                  * set for L2 remains unchanged, and still needs to be added
2690                  * to the newly set TSC to get L2's TSC.
2691                  */
2692                 struct vmcs12 *vmcs12;
2693                 /* recalculate vmcs02.TSC_OFFSET: */
2694                 vmcs12 = get_vmcs12(vcpu);
2695                 vmcs_write64(TSC_OFFSET, offset +
2696                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2697                          vmcs12->tsc_offset : 0));
2698         } else {
2699                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2700                                            vmcs_read64(TSC_OFFSET), offset);
2701                 vmcs_write64(TSC_OFFSET, offset);
2702         }
2703 }
2704 
2705 /*
2706  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2707  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2708  * all guests if the "nested" module option is off, and can also be disabled
2709  * for a single guest by disabling its VMX cpuid bit.
2710  */
2711 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2712 {
2713         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2714 }
2715 
2716 /*
2717  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2718  * returned for the various VMX controls MSRs when nested VMX is enabled.
2719  * The same values should also be used to verify that vmcs12 control fields are
2720  * valid during nested entry from L1 to L2.
2721  * Each of these control msrs has a low and high 32-bit half: A low bit is on
2722  * if the corresponding bit in the (32-bit) control field *must* be on, and a
2723  * bit in the high half is on if the corresponding bit in the control field
2724  * may be on. See also vmx_control_verify().
2725  */
2726 static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2727 {
2728         /*
2729          * Note that as a general rule, the high half of the MSRs (bits in
2730          * the control fields which may be 1) should be initialized by the
2731          * intersection of the underlying hardware's MSR (i.e., features which
2732          * can be supported) and the list of features we want to expose -
2733          * because they are known to be properly supported in our code.
2734          * Also, usually, the low half of the MSRs (bits which must be 1) can
2735          * be set to 0, meaning that L1 may turn off any of these bits. The
2736          * reason is that if one of these bits is necessary, it will appear
2737          * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
2738          * fields of vmcs01 and vmcs02, will turn these bits off - and
2739          * nested_vmx_exit_reflected() will not pass related exits to L1.
2740          * These rules have exceptions below.
2741          */
2742 
2743         /* pin-based controls */
2744         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2745                 vmx->nested.nested_vmx_pinbased_ctls_low,
2746                 vmx->nested.nested_vmx_pinbased_ctls_high);
2747         vmx->nested.nested_vmx_pinbased_ctls_low |=
2748                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2749         vmx->nested.nested_vmx_pinbased_ctls_high &=
2750                 PIN_BASED_EXT_INTR_MASK |
2751                 PIN_BASED_NMI_EXITING |
2752                 PIN_BASED_VIRTUAL_NMIS;
2753         vmx->nested.nested_vmx_pinbased_ctls_high |=
2754                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2755                 PIN_BASED_VMX_PREEMPTION_TIMER;
2756         if (kvm_vcpu_apicv_active(&vmx->vcpu))
2757                 vmx->nested.nested_vmx_pinbased_ctls_high |=
2758                         PIN_BASED_POSTED_INTR;
2759 
2760         /* exit controls */
2761         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2762                 vmx->nested.nested_vmx_exit_ctls_low,
2763                 vmx->nested.nested_vmx_exit_ctls_high);
2764         vmx->nested.nested_vmx_exit_ctls_low =
2765                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2766 
2767         vmx->nested.nested_vmx_exit_ctls_high &=
2768 #ifdef CONFIG_X86_64
2769                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2770 #endif
2771                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2772         vmx->nested.nested_vmx_exit_ctls_high |=
2773                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2774                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2775                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2776 
2777         if (kvm_mpx_supported())
2778                 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2779 
2780         /* We support free control of debug control saving. */
2781         vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2782 
2783         /* entry controls */
2784         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2785                 vmx->nested.nested_vmx_entry_ctls_low,
2786                 vmx->nested.nested_vmx_entry_ctls_high);
2787         vmx->nested.nested_vmx_entry_ctls_low =
2788                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2789         vmx->nested.nested_vmx_entry_ctls_high &=
2790 #ifdef CONFIG_X86_64
2791                 VM_ENTRY_IA32E_MODE |
2792 #endif
2793                 VM_ENTRY_LOAD_IA32_PAT;
2794         vmx->nested.nested_vmx_entry_ctls_high |=
2795                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2796         if (kvm_mpx_supported())
2797                 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2798 
2799         /* We support free control of debug control loading. */
2800         vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2801 
2802         /* cpu-based controls */
2803         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2804                 vmx->nested.nested_vmx_procbased_ctls_low,
2805                 vmx->nested.nested_vmx_procbased_ctls_high);
2806         vmx->nested.nested_vmx_procbased_ctls_low =
2807                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2808         vmx->nested.nested_vmx_procbased_ctls_high &=
2809                 CPU_BASED_VIRTUAL_INTR_PENDING |
2810                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2811                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2812                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2813                 CPU_BASED_CR3_STORE_EXITING |
2814 #ifdef CONFIG_X86_64
2815                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2816 #endif
2817                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2818                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
2819                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
2820                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
2821                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2822         /*
2823          * We can allow some features even when not supported by the
2824          * hardware. For example, L1 can specify an MSR bitmap - and we
2825          * can use it to avoid exits to L1 - even when L0 runs L2
2826          * without MSR bitmaps.
2827          */
2828         vmx->nested.nested_vmx_procbased_ctls_high |=
2829                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2830                 CPU_BASED_USE_MSR_BITMAPS;
2831 
2832         /* We support free control of CR3 access interception. */
2833         vmx->nested.nested_vmx_procbased_ctls_low &=
2834                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2835 
2836         /*
2837          * secondary cpu-based controls.  Do not include those that
2838          * depend on CPUID bits, they are added later by vmx_cpuid_update.
2839          */
2840         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2841                 vmx->nested.nested_vmx_secondary_ctls_low,
2842                 vmx->nested.nested_vmx_secondary_ctls_high);
2843         vmx->nested.nested_vmx_secondary_ctls_low = 0;
2844         vmx->nested.nested_vmx_secondary_ctls_high &=
2845                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2846                 SECONDARY_EXEC_DESC |
2847                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2848                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2849                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2850                 SECONDARY_EXEC_WBINVD_EXITING;
2851 
2852         if (enable_ept) {
2853                 /* nested EPT: emulate EPT also to L1 */
2854                 vmx->nested.nested_vmx_secondary_ctls_high |=
2855                         SECONDARY_EXEC_ENABLE_EPT;
2856                 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2857                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2858                 if (cpu_has_vmx_ept_execute_only())
2859                         vmx->nested.nested_vmx_ept_caps |=
2860                                 VMX_EPT_EXECUTE_ONLY_BIT;
2861                 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2862                 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2863                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
2864                         VMX_EPT_1GB_PAGE_BIT;
2865                 if (enable_ept_ad_bits) {
2866                         vmx->nested.nested_vmx_secondary_ctls_high |=
2867                                 SECONDARY_EXEC_ENABLE_PML;
2868                         vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2869                 }
2870         }
2871 
2872         if (cpu_has_vmx_vmfunc()) {
2873                 vmx->nested.nested_vmx_secondary_ctls_high |=
2874                         SECONDARY_EXEC_ENABLE_VMFUNC;
2875                 /*
2876                  * Advertise EPTP switching unconditionally
2877                  * since we emulate it
2878                  */
2879                 if (enable_ept)
2880                         vmx->nested.nested_vmx_vmfunc_controls =
2881                                 VMX_VMFUNC_EPTP_SWITCHING;
2882         }
2883 
2884         /*
2885          * Old versions of KVM use the single-context version without
2886          * checking for support, so declare that it is supported even
2887          * though it is treated as global context.  The alternative is
2888          * not failing the single-context invvpid, and it is worse.
2889          */
2890         if (enable_vpid) {
2891                 vmx->nested.nested_vmx_secondary_ctls_high |=
2892                         SECONDARY_EXEC_ENABLE_VPID;
2893                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2894                         VMX_VPID_EXTENT_SUPPORTED_MASK;
2895         }
2896 
2897         if (enable_unrestricted_guest)
2898                 vmx->nested.nested_vmx_secondary_ctls_high |=
2899                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
2900 
2901         /* miscellaneous data */
2902         rdmsr(MSR_IA32_VMX_MISC,
2903                 vmx->nested.nested_vmx_misc_low,
2904                 vmx->nested.nested_vmx_misc_high);
2905         vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2906         vmx->nested.nested_vmx_misc_low |=
2907                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2908                 VMX_MISC_ACTIVITY_HLT;
2909         vmx->nested.nested_vmx_misc_high = 0;
2910 
2911         /*
2912          * This MSR reports some information about VMX support. We
2913          * should return information about the VMX we emulate for the
2914          * guest, and the VMCS structure we give it - not about the
2915          * VMX support of the underlying hardware.
2916          */
2917         vmx->nested.nested_vmx_basic =
2918                 VMCS12_REVISION |
2919                 VMX_BASIC_TRUE_CTLS |
2920                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2921                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2922 
2923         if (cpu_has_vmx_basic_inout())
2924                 vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
2925 
2926         /*
2927          * These MSRs specify bits which the guest must keep fixed on
2928          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2929          * We picked the standard core2 setting.
2930          */
2931 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2932 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
2933         vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
2934         vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
2935 
2936         /* These MSRs specify bits which the guest must keep fixed off. */
2937         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
2938         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
2939 
2940         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2941         vmx->nested.nested_vmx_vmcs_enum = 0x2e;
2942 }
2943 
2944 /*
2945  * if fixed0[i] == 1: val[i] must be 1
2946  * if fixed1[i] == 0: val[i] must be 0
2947  */
2948 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
2949 {
2950         return ((val & fixed1) | fixed0) == val;
2951 }
2952 
2953 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2954 {
2955         return fixed_bits_valid(control, low, high);
2956 }
2957 
2958 static inline u64 vmx_control_msr(u32 low, u32 high)
2959 {
2960         return low | ((u64)high << 32);
2961 }
2962 
2963 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
2964 {
2965         superset &= mask;
2966         subset &= mask;
2967 
2968         return (superset | subset) == superset;
2969 }
2970 
2971 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2972 {
2973         const u64 feature_and_reserved =
2974                 /* feature (except bit 48; see below) */
2975                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
2976                 /* reserved */
2977                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
2978         u64 vmx_basic = vmx->nested.nested_vmx_basic;
2979 
2980         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
2981                 return -EINVAL;
2982 
2983         /*
2984          * KVM does not emulate a version of VMX that constrains physical
2985          * addresses of VMX structures (e.g. VMCS) to 32-bits.
2986          */
2987         if (data & BIT_ULL(48))
2988                 return -EINVAL;
2989 
2990         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
2991             vmx_basic_vmcs_revision_id(data))
2992                 return -EINVAL;
2993 
2994         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
2995                 return -EINVAL;
2996 
2997         vmx->nested.nested_vmx_basic = data;
2998         return 0;
2999 }
3000 
3001 static int
3002 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3003 {
3004         u64 supported;
3005         u32 *lowp, *highp;
3006 
3007         switch (msr_index) {
3008         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3009                 lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
3010                 highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
3011                 break;
3012         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3013                 lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
3014                 highp = &vmx->nested.nested_vmx_procbased_ctls_high;
3015                 break;
3016         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3017                 lowp = &vmx->nested.nested_vmx_exit_ctls_low;
3018                 highp = &vmx->nested.nested_vmx_exit_ctls_high;
3019                 break;
3020         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3021                 lowp = &vmx->nested.nested_vmx_entry_ctls_low;
3022                 highp = &vmx->nested.nested_vmx_entry_ctls_high;
3023                 break;
3024         case MSR_IA32_VMX_PROCBASED_CTLS2:
3025                 lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
3026                 highp = &vmx->nested.nested_vmx_secondary_ctls_high;
3027                 break;
3028         default:
3029                 BUG();
3030         }
3031 
3032         supported = vmx_control_msr(*lowp, *highp);
3033 
3034         /* Check must-be-1 bits are still 1. */
3035         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3036                 return -EINVAL;
3037 
3038         /* Check must-be-0 bits are still 0. */
3039         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3040                 return -EINVAL;
3041 
3042         *lowp = data;
3043         *highp = data >> 32;
3044         return 0;
3045 }
3046 
3047 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3048 {
3049         const u64 feature_and_reserved_bits =
3050                 /* feature */
3051                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3052                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3053                 /* reserved */
3054                 GENMASK_ULL(13, 9) | BIT_ULL(31);
3055         u64 vmx_misc;
3056 
3057         vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
3058                                    vmx->nested.nested_vmx_misc_high);
3059 
3060         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3061                 return -EINVAL;
3062 
3063         if ((vmx->nested.nested_vmx_pinbased_ctls_high &
3064              PIN_BASED_VMX_PREEMPTION_TIMER) &&
3065             vmx_misc_preemption_timer_rate(data) !=
3066             vmx_misc_preemption_timer_rate(vmx_misc))
3067                 return -EINVAL;
3068 
3069         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3070                 return -EINVAL;
3071 
3072         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3073                 return -EINVAL;
3074 
3075         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3076                 return -EINVAL;
3077 
3078         vmx->nested.nested_vmx_misc_low = data;
3079         vmx->nested.nested_vmx_misc_high = data >> 32;
3080         return 0;
3081 }
3082 
3083 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3084 {
3085         u64 vmx_ept_vpid_cap;
3086 
3087         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
3088                                            vmx->nested.nested_vmx_vpid_caps);
3089 
3090         /* Every bit is either reserved or a feature bit. */
3091         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3092                 return -EINVAL;
3093 
3094         vmx->nested.nested_vmx_ept_caps = data;
3095         vmx->nested.nested_vmx_vpid_caps = data >> 32;
3096         return 0;
3097 }
3098 
3099 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3100 {
3101         u64 *msr;
3102 
3103         switch (msr_index) {
3104         case MSR_IA32_VMX_CR0_FIXED0:
3105                 msr = &vmx->nested.nested_vmx_cr0_fixed0;
3106                 break;
3107         case MSR_IA32_VMX_CR4_FIXED0:
3108                 msr = &vmx->nested.nested_vmx_cr4_fixed0;
3109                 break;
3110         default:
3111                 BUG();
3112         }
3113 
3114         /*
3115          * 1 bits (which indicates bits which "must-be-1" during VMX operation)
3116          * must be 1 in the restored value.
3117          */
3118         if (!is_bitwise_subset(data, *msr, -1ULL))
3119                 return -EINVAL;
3120 
3121         *msr = data;
3122         return 0;
3123 }
3124 
3125 /*
3126  * Called when userspace is restoring VMX MSRs.
3127  *
3128  * Returns 0 on success, non-0 otherwise.
3129  */
3130 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3131 {
3132         struct vcpu_vmx *vmx = to_vmx(vcpu);
3133 
3134         switch (msr_index) {
3135         case MSR_IA32_VMX_BASIC:
3136                 return vmx_restore_vmx_basic(vmx, data);
3137         case MSR_IA32_VMX_PINBASED_CTLS:
3138         case MSR_IA32_VMX_PROCBASED_CTLS:
3139         case MSR_IA32_VMX_EXIT_CTLS:
3140         case MSR_IA32_VMX_ENTRY_CTLS:
3141                 /*
3142                  * The "non-true" VMX capability MSRs are generated from the
3143                  * "true" MSRs, so we do not support restoring them directly.
3144                  *
3145                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3146                  * should restore the "true" MSRs with the must-be-1 bits
3147                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3148                  * DEFAULT SETTINGS".
3149                  */
3150                 return -EINVAL;
3151         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3152         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3153         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3154         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3155         case MSR_IA32_VMX_PROCBASED_CTLS2:
3156                 return vmx_restore_control_msr(vmx, msr_index, data);
3157         case MSR_IA32_VMX_MISC:
3158                 return vmx_restore_vmx_misc(vmx, data);
3159         case MSR_IA32_VMX_CR0_FIXED0:
3160         case MSR_IA32_VMX_CR4_FIXED0:
3161                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3162         case MSR_IA32_VMX_CR0_FIXED1:
3163         case MSR_IA32_VMX_CR4_FIXED1:
3164                 /*
3165                  * These MSRs are generated based on the vCPU's CPUID, so we
3166                  * do not support restoring them directly.
3167                  */
3168                 return -EINVAL;
3169         case MSR_IA32_VMX_EPT_VPID_CAP:
3170                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3171         case MSR_IA32_VMX_VMCS_ENUM:
3172                 vmx->nested.nested_vmx_vmcs_enum = data;
3173                 return 0;
3174         default:
3175                 /*
3176                  * The rest of the VMX capability MSRs do not support restore.
3177                  */
3178                 return -EINVAL;
3179         }
3180 }
3181 
3182 /* Returns 0 on success, non-0 otherwise. */
3183 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
3184 {
3185         struct vcpu_vmx *vmx = to_vmx(vcpu);
3186 
3187         switch (msr_index) {
3188         case MSR_IA32_VMX_BASIC:
3189                 *pdata = vmx->nested.nested_vmx_basic;
3190                 break;
3191         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3192         case MSR_IA32_VMX_PINBASED_CTLS:
3193                 *pdata = vmx_control_msr(
3194                         vmx->nested.nested_vmx_pinbased_ctls_low,
3195                         vmx->nested.nested_vmx_pinbased_ctls_high);
3196                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3197                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3198                 break;
3199         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3200         case MSR_IA32_VMX_PROCBASED_CTLS:
3201                 *pdata = vmx_control_msr(
3202                         vmx->nested.nested_vmx_procbased_ctls_low,
3203                         vmx->nested.nested_vmx_procbased_ctls_high);
3204                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3205                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3206                 break;
3207         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3208         case MSR_IA32_VMX_EXIT_CTLS:
3209                 *pdata = vmx_control_msr(
3210                         vmx->nested.nested_vmx_exit_ctls_low,
3211                         vmx->nested.nested_vmx_exit_ctls_high);
3212                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
3213                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3214                 break;
3215         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3216         case MSR_IA32_VMX_ENTRY_CTLS:
3217                 *pdata = vmx_control_msr(
3218                         vmx->nested.nested_vmx_entry_ctls_low,
3219                         vmx->nested.nested_vmx_entry_ctls_high);
3220                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
3221                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3222                 break;
3223         case MSR_IA32_VMX_MISC:
3224                 *pdata = vmx_control_msr(
3225                         vmx->nested.nested_vmx_misc_low,
3226                         vmx->nested.nested_vmx_misc_high);
3227                 break;
3228         case MSR_IA32_VMX_CR0_FIXED0:
3229                 *pdata = vmx->nested.nested_vmx_cr0_fixed0;
3230                 break;
3231         case MSR_IA32_VMX_CR0_FIXED1:
3232                 *pdata = vmx->nested.nested_vmx_cr0_fixed1;
3233                 break;
3234         case MSR_IA32_VMX_CR4_FIXED0:
3235                 *pdata = vmx->nested.nested_vmx_cr4_fixed0;
3236                 break;
3237         case MSR_IA32_VMX_CR4_FIXED1:
3238                 *pdata = vmx->nested.nested_vmx_cr4_fixed1;
3239                 break;
3240         case MSR_IA32_VMX_VMCS_ENUM:
3241                 *pdata = vmx->nested.nested_vmx_vmcs_enum;
3242                 break;
3243         case MSR_IA32_VMX_PROCBASED_CTLS2:
3244                 *pdata = vmx_control_msr(
3245                         vmx->nested.nested_vmx_secondary_ctls_low,
3246                         vmx->nested.nested_vmx_secondary_ctls_high);
3247                 break;
3248         case MSR_IA32_VMX_EPT_VPID_CAP:
3249                 *pdata = vmx->nested.nested_vmx_ept_caps |
3250                         ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
3251                 break;
3252         case MSR_IA32_VMX_VMFUNC:
3253                 *pdata = vmx->nested.nested_vmx_vmfunc_controls;
3254                 break;
3255         default:
3256                 return 1;
3257         }
3258 
3259         return 0;
3260 }
3261 
3262 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3263                                                  uint64_t val)
3264 {
3265         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
3266 
3267         return !(val & ~valid_bits);
3268 }
3269 
3270 /*
3271  * Reads an msr value (of 'msr_index') into 'pdata'.
3272  * Returns 0 on success, non-0 otherwise.
3273  * Assumes vcpu_load() was already called.
3274  */
3275 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3276 {
3277         struct shared_msr_entry *msr;
3278 
3279         switch (msr_info->index) {
3280 #ifdef CONFIG_X86_64
3281         case MSR_FS_BASE:
3282                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
3283                 break;
3284         case MSR_GS_BASE:
3285                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
3286                 break;
3287         case MSR_KERNEL_GS_BASE:
3288                 vmx_load_host_state(to_vmx(vcpu));
3289                 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
3290                 break;
3291 #endif
3292         case MSR_EFER:
3293                 return kvm_get_msr_common(vcpu, msr_info);
3294         case MSR_IA32_TSC:
3295                 msr_info->data = guest_read_tsc(vcpu);
3296                 break;
3297         case MSR_IA32_SPEC_CTRL:
3298                 if (!msr_info->host_initiated &&
3299                     !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
3300                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3301                         return 1;
3302 
3303                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
3304                 break;
3305         case MSR_IA32_ARCH_CAPABILITIES:
3306                 if (!msr_info->host_initiated &&
3307                     !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3308                         return 1;
3309                 msr_info->data = to_vmx(vcpu)->arch_capabilities;
3310                 break;
3311         case MSR_IA32_SYSENTER_CS:
3312                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3313                 break;
3314         case MSR_IA32_SYSENTER_EIP:
3315                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
3316                 break;
3317         case MSR_IA32_SYSENTER_ESP:
3318                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3319                 break;
3320         case MSR_IA32_BNDCFGS:
3321                 if (!kvm_mpx_supported() ||
3322                     (!msr_info->host_initiated &&
3323                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3324                         return 1;
3325                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3326                 break;
3327         case MSR_IA32_MCG_EXT_CTL:
3328                 if (!msr_info->host_initiated &&
3329                     !(to_vmx(vcpu)->msr_ia32_feature_control &
3330                       FEATURE_CONTROL_LMCE))
3331                         return 1;
3332                 msr_info->data = vcpu->arch.mcg_ext_ctl;
3333                 break;
3334         case MSR_IA32_FEATURE_CONTROL:
3335                 msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
3336                 break;
3337         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3338                 if (!nested_vmx_allowed(vcpu))
3339                         return 1;
3340                 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
3341         case MSR_IA32_XSS:
3342                 if (!vmx_xsaves_supported())
3343                         return 1;
3344                 msr_info->data = vcpu->arch.ia32_xss;
3345                 break;
3346         case MSR_TSC_AUX:
3347                 if (!msr_info->host_initiated &&
3348                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3349                         return 1;
3350                 /* Otherwise falls through */
3351         default:
3352                 msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
3353                 if (msr) {
3354                         msr_info->data = msr->data;
3355                         break;
3356                 }
3357                 return kvm_get_msr_common(vcpu, msr_info);
3358         }
3359 
3360         return 0;
3361 }
3362 
3363 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
3364 
3365 /*
3366  * Writes msr value into into the appropriate "register".
3367  * Returns 0 on success, non-0 otherwise.
3368  * Assumes vcpu_load() was already called.
3369  */
3370 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3371 {
3372         struct vcpu_vmx *vmx = to_vmx(vcpu);
3373         struct shared_msr_entry *msr;
3374         int ret = 0;
3375         u32 msr_index = msr_info->index;
3376         u64 data = msr_info->data;
3377 
3378         switch (msr_index) {
3379         case MSR_EFER:
3380                 ret = kvm_set_msr_common(vcpu, msr_info);
3381                 break;
3382 #ifdef CONFIG_X86_64
3383         case MSR_FS_BASE:
3384                 vmx_segment_cache_clear(vmx);
3385                 vmcs_writel(GUEST_FS_BASE, data);
3386                 break;
3387         case MSR_GS_BASE:
3388                 vmx_segment_cache_clear(vmx);
3389                 vmcs_writel(GUEST_GS_BASE, data);
3390                 break;
3391         case MSR_KERNEL_GS_BASE:
3392                 vmx_load_host_state(vmx);
3393                 vmx->msr_guest_kernel_gs_base = data;
3394                 break;
3395 #endif
3396         case MSR_IA32_SYSENTER_CS:
3397                 vmcs_write32(GUEST_SYSENTER_CS, data);
3398                 break;
3399         case MSR_IA32_SYSENTER_EIP:
3400                 vmcs_writel(GUEST_SYSENTER_EIP, data);
3401                 break;
3402         case MSR_IA32_SYSENTER_ESP:
3403                 vmcs_writel(GUEST_SYSENTER_ESP, data);
3404                 break;
3405         case MSR_IA32_BNDCFGS:
3406                 if (!kvm_mpx_supported() ||
3407                     (!msr_info->host_initiated &&
3408                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3409                         return 1;
3410                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
3411                     (data & MSR_IA32_BNDCFGS_RSVD))
3412                         return 1;
3413                 vmcs_write64(GUEST_BNDCFGS, data);
3414                 break;
3415         case MSR_IA32_TSC:
3416                 kvm_write_tsc(vcpu, msr_info);
3417                 break;
3418         case MSR_IA32_SPEC_CTRL:
3419                 if (!msr_info->host_initiated &&
3420                     !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
3421                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3422                         return 1;
3423 
3424                 /* The STIBP bit doesn't fault even if it's not advertised */
3425                 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
3426                         return 1;
3427 
3428                 vmx->spec_ctrl = data;
3429 
3430                 if (!data)
3431                         break;
3432 
3433                 /*
3434                  * For non-nested:
3435                  * When it's written (to non-zero) for the first time, pass
3436                  * it through.
3437                  *
3438                  * For nested:
3439                  * The handling of the MSR bitmap for L2 guests is done in
3440                  * nested_vmx_merge_msr_bitmap. We should not touch the
3441                  * vmcs02.msr_bitmap here since it gets completely overwritten
3442                  * in the merging. We update the vmcs01 here for L1 as well
3443                  * since it will end up touching the MSR anyway now.
3444                  */
3445                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
3446                                               MSR_IA32_SPEC_CTRL,
3447                                               MSR_TYPE_RW);
3448                 break;
3449         case MSR_IA32_PRED_CMD:
3450                 if (!msr_info->host_initiated &&
3451                     !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
3452                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
3453                         return 1;
3454 
3455                 if (data & ~PRED_CMD_IBPB)
3456                         return 1;
3457 
3458                 if (!data)
3459                         break;
3460 
3461                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3462 
3463                 /*
3464                  * For non-nested:
3465                  * When it's written (to non-zero) for the first time, pass
3466                  * it through.
3467                  *
3468                  * For nested:
3469                  * The handling of the MSR bitmap for L2 guests is done in
3470                  * nested_vmx_merge_msr_bitmap. We should not touch the
3471                  * vmcs02.msr_bitmap here since it gets completely overwritten
3472                  * in the merging.
3473                  */
3474                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
3475                                               MSR_TYPE_W);
3476                 break;
3477         case MSR_IA32_ARCH_CAPABILITIES:
3478                 if (!msr_info->host_initiated)
3479                         return 1;
3480                 vmx->arch_capabilities = data;
3481                 break;
3482         case MSR_IA32_CR_PAT:
3483                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3484                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3485                                 return 1;
3486                         vmcs_write64(GUEST_IA32_PAT, data);
3487                         vcpu->arch.pat = data;
3488                         break;
3489                 }
3490                 ret = kvm_set_msr_common(vcpu, msr_info);
3491                 break;
3492         case MSR_IA32_TSC_ADJUST:
3493                 ret = kvm_set_msr_common(vcpu, msr_info);
3494                 break;
3495         case MSR_IA32_MCG_EXT_CTL:
3496                 if ((!msr_info->host_initiated &&
3497                      !(to_vmx(vcpu)->msr_ia32_feature_control &
3498                        FEATURE_CONTROL_LMCE)) ||
3499                     (data & ~MCG_EXT_CTL_LMCE_EN))
3500                         return 1;
3501                 vcpu->arch.mcg_ext_ctl = data;
3502                 break;
3503         case MSR_IA32_FEATURE_CONTROL:
3504                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3505                     (to_vmx(vcpu)->msr_ia32_feature_control &
3506                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
3507                         return 1;
3508                 vmx->msr_ia32_feature_control = data;
3509                 if (msr_info->host_initiated && data == 0)
3510                         vmx_leave_nested(vcpu);
3511                 break;
3512         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3513                 if (!msr_info->host_initiated)
3514                         return 1; /* they are read-only */
3515                 if (!nested_vmx_allowed(vcpu))
3516                         return 1;
3517                 return vmx_set_vmx_msr(vcpu, msr_index, data);
3518         case MSR_IA32_XSS:
3519                 if (!vmx_xsaves_supported())
3520                         return 1;
3521                 /*
3522                  * The only supported bit as of Skylake is bit 8, but
3523                  * it is not supported on KVM.
3524                  */
3525                 if (data != 0)
3526                         return 1;
3527                 vcpu->arch.ia32_xss = data;
3528                 if (vcpu->arch.ia32_xss != host_xss)
3529                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
3530                                 vcpu->arch.ia32_xss, host_xss);
3531                 else
3532                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3533                 break;
3534         case MSR_TSC_AUX:
3535                 if (!msr_info->host_initiated &&
3536                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3537                         return 1;
3538                 /* Check reserved bit, higher 32 bits should be zero */
3539                 if ((data >> 32) != 0)
3540                         return 1;
3541                 /* Otherwise falls through */
3542         default:
3543                 msr = find_msr_entry(vmx, msr_index);
3544                 if (msr) {
3545                         u64 old_msr_data = msr->data;
3546                         msr->data = data;
3547                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
3548                                 preempt_disable();
3549                                 ret = kvm_set_shared_msr(msr->index, msr->data,
3550                                                          msr->mask);
3551                                 preempt_enable();
3552                                 if (ret)
3553                                         msr->data = old_msr_data;
3554                         }
3555                         break;
3556                 }
3557                 ret = kvm_set_msr_common(vcpu, msr_info);
3558         }
3559 
3560         return ret;
3561 }
3562 
3563 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3564 {
3565         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
3566         switch (reg) {
3567         case VCPU_REGS_RSP:
3568                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
3569                 break;
3570         case VCPU_REGS_RIP:
3571                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
3572                 break;
3573         case VCPU_EXREG_PDPTR:
3574                 if (enable_ept)
3575                         ept_save_pdptrs(vcpu);
3576                 break;
3577         default:
3578                 break;
3579         }
3580 }
3581 
3582 static __init int cpu_has_kvm_support(void)
3583 {
3584         return cpu_has_vmx();
3585 }
3586 
3587 static __init int vmx_disabled_by_bios(void)
3588 {
3589         u64 msr;
3590 
3591         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3592         if (msr & FEATURE_CONTROL_LOCKED) {
3593                 /* launched w/ TXT and VMX disabled */
3594                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3595                         && tboot_enabled())
3596                         return 1;
3597                 /* launched w/o TXT and VMX only enabled w/ TXT */
3598                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3599                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3600                         && !tboot_enabled()) {
3601                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3602                                 "activate TXT before enabling KVM\n");
3603                         return 1;
3604                 }
3605                 /* launched w/o TXT and VMX disabled */
3606                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3607                         && !tboot_enabled())
3608                         return 1;
3609         }
3610 
3611         return 0;
3612 }
3613 
3614 static void kvm_cpu_vmxon(u64 addr)
3615 {
3616         cr4_set_bits(X86_CR4_VMXE);
3617         intel_pt_handle_vmx(1);
3618 
3619         asm volatile (ASM_VMX_VMXON_RAX
3620                         : : "a"(&addr), "m"(addr)
3621                         : "memory", "cc");
3622 }
3623 
3624 static int hardware_enable(void)
3625 {
3626         int cpu = raw_smp_processor_id();
3627         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3628         u64 old, test_bits;
3629 
3630         if (cr4_read_shadow() & X86_CR4_VMXE)
3631                 return -EBUSY;
3632 
3633         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3634         INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3635         spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3636 
3637         /*
3638          * Now we can enable the vmclear operation in kdump
3639          * since the loaded_vmcss_on_cpu list on this cpu
3640          * has been initialized.
3641          *
3642          * Though the cpu is not in VMX operation now, there
3643          * is no problem to enable the vmclear operation
3644          * for the loaded_vmcss_on_cpu list is empty!
3645          */
3646         crash_enable_local_vmclear(cpu);
3647 
3648         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3649 
3650         test_bits = FEATURE_CONTROL_LOCKED;
3651         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3652         if (tboot_enabled())
3653                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3654 
3655         if ((old & test_bits) != test_bits) {
3656                 /* enable and lock */
3657                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3658         }
3659         kvm_cpu_vmxon(phys_addr);
3660         if (enable_ept)
3661                 ept_sync_global();
3662 
3663         return 0;
3664 }
3665 
3666 static void vmclear_local_loaded_vmcss(void)
3667 {
3668         int cpu = raw_smp_processor_id();
3669         struct loaded_vmcs *v, *n;
3670 
3671         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3672                                  loaded_vmcss_on_cpu_link)
3673                 __loaded_vmcs_clear(v);
3674 }
3675 
3676 
3677 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
3678  * tricks.
3679  */
3680 static void kvm_cpu_vmxoff(void)
3681 {
3682         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
3683 
3684         intel_pt_handle_vmx(0);
3685         cr4_clear_bits(X86_CR4_VMXE);
3686 }
3687 
3688 static void hardware_disable(void)
3689 {
3690         vmclear_local_loaded_vmcss();
3691         kvm_cpu_vmxoff();
3692 }
3693 
3694 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3695                                       u32 msr, u32 *result)
3696 {
3697         u32 vmx_msr_low, vmx_msr_high;
3698         u32 ctl = ctl_min | ctl_opt;
3699 
3700         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3701 
3702         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3703         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
3704 
3705         /* Ensure minimum (required) set of control bits are supported. */
3706         if (ctl_min & ~ctl)
3707                 return -EIO;
3708 
3709         *result = ctl;
3710         return 0;
3711 }
3712 
3713 static __init bool allow_1_setting(u32 msr, u32 ctl)
3714 {
3715         u32 vmx_msr_low, vmx_msr_high;
3716 
3717         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3718         return vmx_msr_high & ctl;
3719 }
3720 
3721 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3722 {
3723         u32 vmx_msr_low, vmx_msr_high;
3724         u32 min, opt, min2, opt2;
3725         u32 _pin_based_exec_control = 0;
3726         u32 _cpu_based_exec_control = 0;
3727         u32 _cpu_based_2nd_exec_control = 0;
3728         u32 _vmexit_control = 0;
3729         u32 _vmentry_control = 0;
3730 
3731         min = CPU_BASED_HLT_EXITING |
3732 #ifdef CONFIG_X86_64
3733               CPU_BASED_CR8_LOAD_EXITING |
3734               CPU_BASED_CR8_STORE_EXITING |
3735 #endif
3736               CPU_BASED_CR3_LOAD_EXITING |
3737               CPU_BASED_CR3_STORE_EXITING |
3738               CPU_BASED_USE_IO_BITMAPS |
3739               CPU_BASED_MOV_DR_EXITING |
3740               CPU_BASED_USE_TSC_OFFSETING |
3741               CPU_BASED_INVLPG_EXITING |
3742               CPU_BASED_RDPMC_EXITING;
3743 
3744         if (!kvm_mwait_in_guest())
3745                 min |= CPU_BASED_MWAIT_EXITING |
3746                         CPU_BASED_MONITOR_EXITING;
3747 
3748         opt = CPU_BASED_TPR_SHADOW |
3749               CPU_BASED_USE_MSR_BITMAPS |
3750               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3751         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3752                                 &_cpu_based_exec_control) < 0)
3753                 return -EIO;
3754 #ifdef CONFIG_X86_64
3755         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3756                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3757                                            ~CPU_BASED_CR8_STORE_EXITING;
3758 #endif
3759         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3760                 min2 = 0;
3761                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3762                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3763                         SECONDARY_EXEC_WBINVD_EXITING |
3764                         SECONDARY_EXEC_ENABLE_VPID |
3765                         SECONDARY_EXEC_ENABLE_EPT |
3766                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
3767                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3768                         SECONDARY_EXEC_RDTSCP |
3769                         SECONDARY_EXEC_ENABLE_INVPCID |
3770                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
3771                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3772                         SECONDARY_EXEC_SHADOW_VMCS |
3773                         SECONDARY_EXEC_XSAVES |
3774                         SECONDARY_EXEC_RDSEED_EXITING |
3775                         SECONDARY_EXEC_RDRAND_EXITING |
3776                         SECONDARY_EXEC_ENABLE_PML |
3777                         SECONDARY_EXEC_TSC_SCALING |
3778                         SECONDARY_EXEC_ENABLE_VMFUNC;
3779                 if (adjust_vmx_controls(min2, opt2,
3780                                         MSR_IA32_VMX_PROCBASED_CTLS2,
3781                                         &_cpu_based_2nd_exec_control) < 0)
3782                         return -EIO;
3783         }
3784 #ifndef CONFIG_X86_64
3785         if (!(_cpu_based_2nd_exec_control &
3786                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3787                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3788 #endif
3789 
3790         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3791                 _cpu_based_2nd_exec_control &= ~(
3792                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3793                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3794                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3795 
3796         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
3797                 &vmx_capability.ept, &vmx_capability.vpid);
3798 
3799         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3800                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
3801                    enabled */
3802                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3803                                              CPU_BASED_CR3_STORE_EXITING |
3804                                              CPU_BASED_INVLPG_EXITING);
3805         } else if (vmx_capability.ept) {
3806                 vmx_capability.ept = 0;
3807                 pr_warn_once("EPT CAP should not exist if not support "
3808                                 "1-setting enable EPT VM-execution control\n");
3809         }
3810         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3811                 vmx_capability.vpid) {
3812                 vmx_capability.vpid = 0;
3813                 pr_warn_once("VPID CAP should not exist if not support "
3814                                 "1-setting enable VPID VM-execution control\n");
3815         }
3816 
3817         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3818 #ifdef CONFIG_X86_64
3819         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3820 #endif
3821         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3822                 VM_EXIT_CLEAR_BNDCFGS;
3823         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3824                                 &_vmexit_control) < 0)
3825                 return -EIO;
3826 
3827         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3828         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
3829                  PIN_BASED_VMX_PREEMPTION_TIMER;
3830         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3831                                 &_pin_based_exec_control) < 0)
3832                 return -EIO;
3833 
3834         if (cpu_has_broken_vmx_preemption_timer())
3835                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3836         if (!(_cpu_based_2nd_exec_control &
3837                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3838                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3839 
3840         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3841         opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3842         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3843                                 &_vmentry_control) < 0)
3844                 return -EIO;
3845 
3846         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3847 
3848         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3849         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3850                 return -EIO;
3851 
3852 #ifdef CONFIG_X86_64
3853         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3854         if (vmx_msr_high & (1u<<16))
3855                 return -EIO;
3856 #endif
3857 
3858         /* Require Write-Back (WB) memory type for VMCS accesses. */
3859         if (((vmx_msr_high >> 18) & 15) != 6)
3860                 return -EIO;
3861 
3862         vmcs_conf->size = vmx_msr_high & 0x1fff;
3863         vmcs_conf->order = get_order(vmcs_conf->size);
3864         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3865         vmcs_conf->revision_id = vmx_msr_low;
3866 
3867         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3868         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3869         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3870         vmcs_conf->vmexit_ctrl         = _vmexit_control;
3871         vmcs_conf->vmentry_ctrl        = _vmentry_control;
3872 
3873         cpu_has_load_ia32_efer =
3874                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3875                                 VM_ENTRY_LOAD_IA32_EFER)
3876                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3877                                    VM_EXIT_LOAD_IA32_EFER);
3878 
3879         cpu_has_load_perf_global_ctrl =
3880                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3881                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
3882                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3883                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
3884 
3885         /*
3886          * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
3887          * but due to errata below it can't be used. Workaround is to use
3888          * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3889          *
3890          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
3891          *
3892          * AAK155             (model 26)
3893          * AAP115             (model 30)
3894          * AAT100             (model 37)
3895          * BC86,AAY89,BD102   (model 44)
3896          * BA97               (model 46)
3897          *
3898          */
3899         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
3900                 switch (boot_cpu_data.x86_model) {
3901                 case 26:
3902                 case 30:
3903                 case 37:
3904                 case 44:
3905                 case 46:
3906                         cpu_has_load_perf_global_ctrl = false;
3907                         printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3908                                         "does not work properly. Using workaround\n");
3909                         break;
3910                 default:
3911                         break;
3912                 }
3913         }
3914 
3915         if (boot_cpu_has(X86_FEATURE_XSAVES))
3916                 rdmsrl(MSR_IA32_XSS, host_xss);
3917 
3918         return 0;
3919 }
3920 
3921 static struct vmcs *alloc_vmcs_cpu(int cpu)
3922 {
3923         int node = cpu_to_node(cpu);
3924         struct page *pages;
3925         struct vmcs *vmcs;
3926 
3927         pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
3928         if (!pages)
3929                 return NULL;
3930         vmcs = page_address(pages);
3931         memset(vmcs, 0, vmcs_config.size);
3932         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
3933         return vmcs;
3934 }
3935 
3936 static void free_vmcs(struct vmcs *vmcs)
3937 {
3938         free_pages((unsigned long)vmcs, vmcs_config.order);
3939 }
3940 
3941 /*
3942  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3943  */
3944 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3945 {
3946         if (!loaded_vmcs->vmcs)
3947                 return;
3948         loaded_vmcs_clear(loaded_vmcs);
3949         free_vmcs(loaded_vmcs->vmcs);
3950         loaded_vmcs->vmcs = NULL;
3951         if (loaded_vmcs->msr_bitmap)
3952                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
3953         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3954 }
3955 
3956 static struct vmcs *alloc_vmcs(void)
3957 {
3958         return alloc_vmcs_cpu(raw_smp_processor_id());
3959 }
3960 
3961 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3962 {
3963         loaded_vmcs->vmcs = alloc_vmcs();
3964         if (!loaded_vmcs->vmcs)
3965                 return -ENOMEM;
3966 
3967         loaded_vmcs->shadow_vmcs = NULL;
3968         loaded_vmcs_init(loaded_vmcs);
3969 
3970         if (cpu_has_vmx_msr_bitmap()) {
3971                 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
3972                 if (!loaded_vmcs->msr_bitmap)
3973                         goto out_vmcs;
3974                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3975         }
3976         return 0;
3977 
3978 out_vmcs:
3979         free_loaded_vmcs(loaded_vmcs);
3980         return -ENOMEM;
3981 }
3982 
3983 static void free_kvm_area(void)
3984 {
3985         int cpu;
3986 
3987         for_each_possible_cpu(cpu) {
3988                 free_vmcs(per_cpu(vmxarea, cpu));
3989                 per_cpu(vmxarea, cpu) = NULL;
3990         }
3991 }
3992 
3993 enum vmcs_field_type {
3994         VMCS_FIELD_TYPE_U16 = 0,
3995         VMCS_FIELD_TYPE_U64 = 1,
3996         VMCS_FIELD_TYPE_U32 = 2,
3997         VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
3998 };
3999 
4000 static inline int vmcs_field_type(unsigned long field)
4001 {
4002         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
4003                 return VMCS_FIELD_TYPE_U32;
4004         return (field >> 13) & 0x3 ;
4005 }
4006 
4007 static inline int vmcs_field_readonly(unsigned long field)
4008 {
4009         return (((field >> 10) & 0x3) == 1);
4010 }
4011 
4012 static void init_vmcs_shadow_fields(void)
4013 {
4014         int i, j;
4015 
4016         /* No checks for read only fields yet */
4017 
4018         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
4019                 switch (shadow_read_write_fields[i]) {
4020                 case GUEST_BNDCFGS:
4021                         if (!kvm_mpx_supported())
4022                                 continue;
4023                         break;
4024                 default:
4025                         break;
4026                 }
4027 
4028                 if (j < i)
4029                         shadow_read_write_fields[j] =
4030                                 shadow_read_write_fields[i];
4031                 j++;
4032         }
4033         max_shadow_read_write_fields = j;
4034 
4035         /* shadowed fields guest access without vmexit */
4036         for (i = 0; i < max_shadow_read_write_fields; i++) {
4037                 unsigned long field = shadow_read_write_fields[i];
4038 
4039                 clear_bit(field, vmx_vmwrite_bitmap);
4040                 clear_bit(field, vmx_vmread_bitmap);
4041                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
4042                         clear_bit(field + 1, vmx_vmwrite_bitmap);
4043                         clear_bit(field + 1, vmx_vmread_bitmap);
4044                 }
4045         }
4046         for (i = 0; i < max_shadow_read_only_fields; i++) {
4047                 unsigned long field = shadow_read_only_fields[i];
4048 
4049                 clear_bit(field, vmx_vmread_bitmap);
4050                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
4051                         clear_bit(field + 1, vmx_vmread_bitmap);
4052         }
4053 }
4054 
4055 static __init int alloc_kvm_area(void)
4056 {
4057         int cpu;
4058 
4059         for_each_possible_cpu(cpu) {
4060                 struct vmcs *vmcs;
4061 
4062                 vmcs = alloc_vmcs_cpu(cpu);
4063                 if (!vmcs) {
4064                         free_kvm_area();
4065                         return -ENOMEM;
4066                 }
4067 
4068                 per_cpu(vmxarea, cpu) = vmcs;
4069         }
4070         return 0;
4071 }
4072 
4073 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
4074                 struct kvm_segment *save)
4075 {
4076         if (!emulate_invalid_guest_state) {
4077                 /*
4078                  * CS and SS RPL should be equal during guest entry according
4079                  * to VMX spec, but in reality it is not always so. Since vcpu
4080                  * is in the middle of the transition from real mode to
4081                  * protected mode it is safe to assume that RPL 0 is a good
4082                  * default value.
4083                  */
4084                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
4085                         save->selector &= ~SEGMENT_RPL_MASK;
4086                 save->dpl = save->selector & SEGMENT_RPL_MASK;
4087                 save->s = 1;
4088         }
4089         vmx_set_segment(vcpu, save, seg);
4090 }
4091 
4092 static void enter_pmode(struct kvm_vcpu *vcpu)
4093 {
4094         unsigned long flags;
4095         struct vcpu_vmx *vmx = to_vmx(vcpu);
4096 
4097         /*
4098          * Update real mode segment cache. It may be not up-to-date if sement
4099          * register was written while vcpu was in a guest mode.
4100          */
4101         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4102         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4103         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4104         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4105         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4106         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4107 
4108         vmx->rmode.vm86_active = 0;
4109 
4110         vmx_segment_cache_clear(vmx);
4111 
4112         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4113 
4114         flags = vmcs_readl(GUEST_RFLAGS);
4115         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4116         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
4117         vmcs_writel(GUEST_RFLAGS, flags);
4118 
4119         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
4120                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
4121 
4122         update_exception_bitmap(vcpu);
4123 
4124         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4125         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4126         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4127         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4128         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4129         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
4130 }
4131 
4132 static void fix_rmode_seg(int seg, struct kvm_segment *save)
4133 {
4134         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4135         struct kvm_segment var = *save;
4136 
4137         var.dpl = 0x3;
4138         if (seg == VCPU_SREG_CS)
4139                 var.type = 0x3;
4140 
4141         if (!emulate_invalid_guest_state) {
4142                 var.selector = var.base >> 4;
4143                 var.base = var.base & 0xffff0;
4144                 var.limit = 0xffff;
4145                 var.g = 0;
4146                 var.db = 0;
4147                 var.present = 1;
4148                 var.s = 1;
4149                 var.l = 0;
4150                 var.unusable = 0;
4151                 var.type = 0x3;
4152                 var.avl = 0;
4153                 if (save->base & 0xf)
4154                         printk_once(KERN_WARNING "kvm: segment base is not "
4155                                         "paragraph aligned when entering "
4156                                         "protected mode (seg=%d)", seg);
4157         }
4158 
4159         vmcs_write16(sf->selector, var.selector);
4160         vmcs_writel(sf->base, var.base);
4161         vmcs_write32(sf->limit, var.limit);
4162         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
4163 }
4164 
4165 static void enter_rmode(struct kvm_vcpu *vcpu)
4166 {
4167         unsigned long flags;
4168         struct vcpu_vmx *vmx = to_vmx(vcpu);
4169 
4170         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4171         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4172         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4173         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4174         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4175         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4176         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4177 
4178         vmx->rmode.vm86_active = 1;
4179 
4180         /*
4181          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4182          * vcpu. Warn the user that an update is overdue.
4183          */
4184         if (!vcpu->kvm->arch.tss_addr)
4185                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
4186                              "called before entering vcpu\n");
4187 
4188         vmx_segment_cache_clear(vmx);
4189 
4190         vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
4191         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
4192         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4193 
4194         flags = vmcs_readl(GUEST_RFLAGS);
4195         vmx->rmode.save_rflags = flags;
4196 
4197         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
4198 
4199         vmcs_writel(GUEST_RFLAGS, flags);
4200         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
4201         update_exception_bitmap(vcpu);
4202 
4203         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4204         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4205         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4206         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4207         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
4208         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4209 
4210         kvm_mmu_reset_context(vcpu);
4211 }
4212 
4213 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
4214 {
4215         struct vcpu_vmx *vmx = to_vmx(vcpu);
4216         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
4217 
4218         if (!msr)
4219                 return;
4220 
4221         /*
4222          * Force kernel_gs_base reloading before EFER changes, as control
4223          * of this msr depends on is_long_mode().
4224          */
4225         vmx_load_host_state(to_vmx(vcpu));
4226         vcpu->arch.efer = efer;
4227         if (efer & EFER_LMA) {
4228                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4229                 msr->data = efer;
4230         } else {
4231                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4232 
4233                 msr->data = efer & ~EFER_LME;
4234         }
4235         setup_msrs(vmx);
4236 }
4237 
4238 #ifdef CONFIG_X86_64
4239 
4240 static void enter_lmode(struct kvm_vcpu *vcpu)
4241 {
4242         u32 guest_tr_ar;
4243 
4244         vmx_segment_cache_clear(to_vmx(vcpu));
4245 
4246         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4247         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
4248                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
4249                                      __func__);
4250                 vmcs_write32(GUEST_TR_AR_BYTES,
4251                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
4252                              | VMX_AR_TYPE_BUSY_64_TSS);
4253         }
4254         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
4255 }
4256 
4257 static void exit_lmode(struct kvm_vcpu *vcpu)
4258 {
4259         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4260         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
4261 }
4262 
4263 #endif
4264 
4265 static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
4266 {
4267         if (enable_ept) {
4268                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
4269                         return;
4270                 ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
4271         } else {
4272                 vpid_sync_context(vpid);
4273         }
4274 }
4275 
4276 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
4277 {
4278         __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
4279 }
4280 
4281 static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
4282 {
4283         if (enable_ept)
4284                 vmx_flush_tlb(vcpu);
4285 }
4286 
4287 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
4288 {
4289         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
4290 
4291         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
4292         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
4293 }
4294 
4295 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
4296 {
4297         if (enable_ept && is_paging(vcpu))
4298                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4299         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
4300 }
4301 
4302 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
4303 {
4304         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
4305 
4306         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
4307         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
4308 }
4309 
4310 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
4311 {
4312         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4313 
4314         if (!test_bit(VCPU_EXREG_PDPTR,
4315                       (unsigned long *)&vcpu->arch.regs_dirty))
4316                 return;
4317 
4318         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
4319                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
4320                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
4321                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
4322                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
4323         }
4324 }
4325 
4326 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
4327 {
4328         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4329 
4330         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
4331                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
4332                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
4333                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
4334                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
4335         }
4336 
4337         __set_bit(VCPU_EXREG_PDPTR,
4338                   (unsigned long *)&vcpu->arch.regs_avail);
4339         __set_bit(VCPU_EXREG_PDPTR,
4340                   (unsigned long *)&vcpu->arch.regs_dirty);
4341 }
4342 
4343 static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4344 {
4345         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
4346         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
4347         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4348 
4349         if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
4350                 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4351             nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4352                 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
4353 
4354         return fixed_bits_valid(val, fixed0, fixed1);
4355 }
4356 
4357 static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4358 {
4359         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
4360         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
4361 
4362         return fixed_bits_valid(val, fixed0, fixed1);
4363 }
4364 
4365 static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
4366 {
4367         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
4368         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
4369 
4370         return fixed_bits_valid(val, fixed0, fixed1);
4371 }
4372 
4373 /* No difference in the restrictions on guest and host CR4 in VMX operation. */
4374 #define nested_guest_cr4_valid  nested_cr4_valid
4375 #define nested_host_cr4_valid   nested_cr4_valid
4376 
4377 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
4378 
4379 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
4380                                         unsigned long cr0,
4381                                         struct kvm_vcpu *vcpu)
4382 {
4383         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
4384                 vmx_decache_cr3(vcpu);
4385         if (!(cr0 & X86_CR0_PG)) {
4386                 /* From paging/starting to nonpaging */
4387                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4388                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
4389                              (CPU_BASED_CR3_LOAD_EXITING |
4390                               CPU_BASED_CR3_STORE_EXITING));
4391                 vcpu->arch.cr0 = cr0;
4392                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4393         } else if (!is_paging(vcpu)) {
4394                 /* From nonpaging to paging */
4395                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4396                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
4397                              ~(CPU_BASED_CR3_LOAD_EXITING |
4398                                CPU_BASED_CR3_STORE_EXITING));
4399                 vcpu->arch.cr0 = cr0;
4400                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4401         }
4402 
4403         if (!(cr0 & X86_CR0_WP))
4404                 *hw_cr0 &= ~X86_CR0_WP;
4405 }
4406 
4407 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4408 {
4409         struct vcpu_vmx *vmx = to_vmx(vcpu);
4410         unsigned long hw_cr0;
4411 
4412         hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
4413         if (enable_unrestricted_guest)
4414                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
4415         else {
4416                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
4417 
4418                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
4419                         enter_pmode(vcpu);
4420 
4421                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
4422                         enter_rmode(vcpu);
4423         }
4424 
4425 #ifdef CONFIG_X86_64
4426         if (vcpu->arch.efer & EFER_LME) {
4427                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
4428                         enter_lmode(vcpu);
4429                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
4430                         exit_lmode(vcpu);
4431         }
4432 #endif
4433 
4434         if (enable_ept)
4435                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4436 
4437         vmcs_writel(CR0_READ_SHADOW, cr0);
4438         vmcs_writel(GUEST_CR0, hw_cr0);
4439         vcpu->arch.cr0 = cr0;
4440 
4441         /* depends on vcpu->arch.cr0 to be set to a new value */
4442         vmx->emulation_required = emulation_required(vcpu);
4443 }
4444 
4445 static int get_ept_level(struct kvm_vcpu *vcpu)
4446 {
4447         if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
4448                 return 5;
4449         return 4;
4450 }
4451 
4452 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
4453 {
4454         u64 eptp = VMX_EPTP_MT_WB;
4455 
4456         eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
4457 
4458         if (enable_ept_ad_bits &&
4459             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
4460                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
4461         eptp |= (root_hpa & PAGE_MASK);
4462 
4463         return eptp;
4464 }
4465 
4466 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
4467 {
4468         unsigned long guest_cr3;
4469         u64 eptp;
4470 
4471         guest_cr3 = cr3;
4472         if (enable_ept) {
4473                 eptp = construct_eptp(vcpu, cr3);
4474                 vmcs_write64(EPT_POINTER, eptp);
4475                 if (is_paging(vcpu) || is_guest_mode(vcpu))
4476                         guest_cr3 = kvm_read_cr3(vcpu);
4477                 else
4478                         guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
4479                 ept_load_pdptrs(vcpu);
4480         }
4481 
4482         vmx_flush_tlb(vcpu);
4483         vmcs_writel(GUEST_CR3, guest_cr3);
4484 }
4485 
4486 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4487 {
4488         /*
4489          * Pass through host's Machine Check Enable value to hw_cr4, which
4490          * is in force while we are in guest mode.  Do not let guests control
4491          * this bit, even if host CR4.MCE == 0.
4492          */
4493         unsigned long hw_cr4 =
4494                 (cr4_read_shadow() & X86_CR4_MCE) |
4495                 (cr4 & ~X86_CR4_MCE) |
4496                 (to_vmx(vcpu)->rmode.vm86_active ?
4497                  KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
4498 
4499         if (cr4 & X86_CR4_VMXE) {
4500                 /*
4501                  * To use VMXON (and later other VMX instructions), a guest
4502                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
4503                  * So basically the check on whether to allow nested VMX
4504                  * is here.
4505                  */
4506                 if (!nested_vmx_allowed(vcpu))
4507                         return 1;
4508         }
4509 
4510         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
4511                 return 1;
4512 
4513         vcpu->arch.cr4 = cr4;
4514         if (enable_ept) {
4515                 if (!is_paging(vcpu)) {
4516                         hw_cr4 &= ~X86_CR4_PAE;
4517                         hw_cr4 |= X86_CR4_PSE;
4518                 } else if (!(cr4 & X86_CR4_PAE)) {
4519                         hw_cr4 &= ~X86_CR4_PAE;
4520                 }
4521         }
4522 
4523         if (!enable_unrestricted_guest && !is_paging(vcpu))
4524                 /*
4525                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
4526                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
4527                  * to be manually disabled when guest switches to non-paging
4528                  * mode.
4529                  *
4530                  * If !enable_unrestricted_guest, the CPU is always running
4531                  * with CR0.PG=1 and CR4 needs to be modified.
4532                  * If enable_unrestricted_guest, the CPU automatically
4533                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
4534                  */
4535                 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
4536 
4537         vmcs_writel(CR4_READ_SHADOW, cr4);
4538         vmcs_writel(GUEST_CR4, hw_cr4);
4539         return 0;
4540 }
4541 
4542 static void vmx_get_segment(struct kvm_vcpu *vcpu,
4543                             struct kvm_segment *var, int seg)
4544 {
4545         struct vcpu_vmx *vmx = to_vmx(vcpu);
4546         u32 ar;
4547 
4548         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4549                 *var = vmx->rmode.segs[seg];
4550                 if (seg == VCPU_SREG_TR
4551                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
4552                         return;
4553                 var->base = vmx_read_guest_seg_base(vmx, seg);
4554                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
4555                 return;
4556         }
4557         var->base = vmx_read_guest_seg_base(vmx, seg);
4558         var->limit = vmx_read_guest_seg_limit(vmx, seg);
4559         var->selector = vmx_read_guest_seg_selector(vmx, seg);
4560         ar = vmx_read_guest_seg_ar(vmx, seg);
4561         var->unusable = (ar >> 16) & 1;
4562         var->type = ar & 15;
4563         var->s = (ar >> 4) & 1;
4564         var->dpl = (ar >> 5) & 3;
4565         /*
4566          * Some userspaces do not preserve unusable property. Since usable
4567          * segment has to be present according to VMX spec we can use present
4568          * property to amend userspace bug by making unusable segment always
4569          * nonpresent. vmx_segment_access_rights() already marks nonpresent
4570          * segment as unusable.
4571          */
4572         var->present = !var->unusable;
4573         var->avl = (ar >> 12) & 1;
4574         var->l = (ar >> 13) & 1;
4575         var->db = (ar >> 14) & 1;
4576         var->g = (ar >> 15) & 1;
4577 }
4578 
4579 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
4580 {
4581         struct kvm_segment s;
4582 
4583         if (to_vmx(vcpu)->rmode.vm86_active) {
4584                 vmx_get_segment(vcpu, &s, seg);
4585                 return s.base;
4586         }
4587         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
4588 }
4589 
4590 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
4591 {
4592         struct vcpu_vmx *vmx = to_vmx(vcpu);
4593 
4594         if (unlikely(vmx->rmode.vm86_active))
4595                 return 0;
4596         else {
4597                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4598                 return VMX_AR_DPL(ar);
4599         }
4600 }
4601 
4602 static u32 vmx_segment_access_rights(struct kvm_segment *var)
4603 {
4604         u32 ar;
4605 
4606         if (var->unusable || !var->present)
4607                 ar = 1 << 16;
4608         else {
4609                 ar = var->type & 15;
4610                 ar |= (var->s & 1) << 4;
4611                 ar |= (var->dpl & 3) << 5;
4612                 ar |= (var->present & 1) << 7;
4613                 ar |= (var->avl & 1) << 12;
4614                 ar |= (var->l & 1) << 13;
4615                 ar |= (var->db & 1) << 14;
4616                 ar |= (var->g & 1) << 15;
4617         }
4618 
4619         return ar;
4620 }
4621 
4622 static void vmx_set_segment(struct kvm_vcpu *vcpu,
4623                             struct kvm_segment *var, int seg)
4624 {
4625         struct vcpu_vmx *vmx = to_vmx(vcpu);
4626         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4627 
4628         vmx_segment_cache_clear(vmx);
4629 
4630         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4631                 vmx->rmode.segs[seg] = *var;
4632                 if (seg == VCPU_SREG_TR)
4633                         vmcs_write16(sf->selector, var->selector);
4634                 else if (var->s)
4635                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4636                 goto out;
4637         }
4638 
4639         vmcs_writel(sf->base, var->base);
4640         vmcs_write32(sf->limit, var->limit);
4641         vmcs_write16(sf->selector, var->selector);
4642 
4643         /*
4644          *   Fix the "Accessed" bit in AR field of segment registers for older
4645          * qemu binaries.
4646          *   IA32 arch specifies that at the time of processor reset the
4647          * "Accessed" bit in the AR field of segment registers is 1. And qemu
4648          * is setting it to 0 in the userland code. This causes invalid guest
4649          * state vmexit when "unrestricted guest" mode is turned on.
4650          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
4651          * tree. Newer qemu binaries with that qemu fix would not need this
4652          * kvm hack.
4653          */
4654         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4655                 var->type |= 0x1; /* Accessed */
4656 
4657         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4658 
4659 out:
4660         vmx->emulation_required = emulation_required(vcpu);
4661 }
4662 
4663 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4664 {
4665         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
4666 
4667         *db = (ar >> 14) & 1;
4668         *l = (ar >> 13) & 1;
4669 }
4670 
4671 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4672 {
4673         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
4674         dt->address = vmcs_readl(GUEST_IDTR_BASE);
4675 }
4676 
4677 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4678 {
4679         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
4680         vmcs_writel(GUEST_IDTR_BASE, dt->address);
4681 }
4682 
4683 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4684 {
4685         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
4686         dt->address = vmcs_readl(GUEST_GDTR_BASE);
4687 }
4688 
4689 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4690 {
4691         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
4692         vmcs_writel(GUEST_GDTR_BASE, dt->address);
4693 }
4694 
4695 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
4696 {
4697         struct kvm_segment var;
4698         u32 ar;
4699 
4700         vmx_get_segment(vcpu, &var, seg);
4701         var.dpl = 0x3;
4702         if (seg == VCPU_SREG_CS)
4703                 var.type = 0x3;
4704         ar = vmx_segment_access_rights(&var);
4705 
4706         if (var.base != (var.selector << 4))
4707                 return false;
4708         if (var.limit != 0xffff)
4709                 return false;
4710         if (ar != 0xf3)
4711                 return false;
4712 
4713         return true;
4714 }
4715 
4716 static bool code_segment_valid(struct kvm_vcpu *vcpu)
4717 {
4718         struct kvm_segment cs;
4719         unsigned int cs_rpl;
4720 
4721         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4722         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4723 
4724         if (cs.unusable)
4725                 return false;
4726         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4727                 return false;
4728         if (!cs.s)
4729                 return false;
4730         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4731                 if (cs.dpl > cs_rpl)
4732                         return false;
4733         } else {
4734                 if (cs.dpl != cs_rpl)
4735                         return false;
4736         }
4737         if (!cs.present)
4738                 return false;
4739 
4740         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
4741         return true;
4742 }
4743 
4744 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
4745 {
4746         struct kvm_segment ss;
4747         unsigned int ss_rpl;
4748 
4749         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4750         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4751 
4752         if (ss.unusable)
4753                 return true;
4754         if (ss.type != 3 && ss.type != 7)
4755                 return false;
4756         if (!ss.s)
4757                 return false;
4758         if (ss.dpl != ss_rpl) /* DPL != RPL */
4759                 return false;
4760         if (!ss.present)
4761                 return false;
4762 
4763         return true;
4764 }
4765 
4766 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
4767 {
4768         struct kvm_segment var;
4769         unsigned int rpl;
4770 
4771         vmx_get_segment(vcpu, &var, seg);
4772         rpl = var.selector & SEGMENT_RPL_MASK;
4773 
4774         if (var.unusable)
4775                 return true;
4776         if (!var.s)
4777                 return false;
4778         if (!var.present)
4779                 return false;
4780         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4781                 if (var.dpl < rpl) /* DPL < RPL */
4782                         return false;
4783         }
4784 
4785         /* TODO: Add other members to kvm_segment_field to allow checking for other access
4786          * rights flags
4787          */
4788         return true;
4789 }
4790 
4791 static bool tr_valid(struct kvm_vcpu *vcpu)
4792 {
4793         struct kvm_segment tr;
4794 
4795         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
4796 
4797         if (tr.unusable)
4798                 return false;
4799         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
4800                 return false;
4801         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4802                 return false;
4803         if (!tr.present)
4804                 return false;
4805 
4806         return true;
4807 }
4808 
4809 static bool ldtr_valid(struct kvm_vcpu *vcpu)
4810 {
4811         struct kvm_segment ldtr;
4812 
4813         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
4814 
4815         if (ldtr.unusable)
4816                 return true;
4817         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
4818                 return false;
4819         if (ldtr.type != 2)
4820                 return false;
4821         if (!ldtr.present)
4822                 return false;
4823 
4824         return true;
4825 }
4826 
4827 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
4828 {
4829         struct kvm_segment cs, ss;
4830 
4831         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4832         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4833 
4834         return ((cs.selector & SEGMENT_RPL_MASK) ==
4835                  (ss.selector & SEGMENT_RPL_MASK));
4836 }
4837 
4838 /*
4839  * Check if guest state is valid. Returns true if valid, false if
4840  * not.
4841  * We assume that registers are always usable
4842  */
4843 static bool guest_state_valid(struct kvm_vcpu *vcpu)
4844 {
4845         if (enable_unrestricted_guest)
4846                 return true;
4847 
4848         /* real mode guest state checks */
4849         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4850                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4851                         return false;
4852                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4853                         return false;
4854                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4855                         return false;
4856                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4857                         return false;
4858                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4859                         return false;
4860                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4861                         return false;
4862         } else {
4863         /* protected mode guest state checks */
4864                 if (!cs_ss_rpl_check(vcpu))
4865                         return false;
4866                 if (!code_segment_valid(vcpu))
4867                         return false;
4868                 if (!stack_segment_valid(vcpu))
4869                         return false;
4870                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4871                         return false;
4872                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4873                         return false;
4874                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4875                         return false;
4876                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4877                         return false;
4878                 if (!tr_valid(vcpu))
4879                         return false;
4880                 if (!ldtr_valid(vcpu))
4881                         return false;
4882         }
4883         /* TODO:
4884          * - Add checks on RIP
4885          * - Add checks on RFLAGS
4886          */
4887 
4888         return true;
4889 }
4890 
4891 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
4892 {
4893         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
4894 }
4895 
4896 static int init_rmode_tss(struct kvm *kvm)
4897 {
4898         gfn_t fn;
4899         u16 data = 0;
4900         int idx, r;
4901 
4902         idx = srcu_read_lock(&kvm->srcu);
4903         fn = kvm->arch.tss_addr >> PAGE_SHIFT;
4904         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4905         if (r < 0)
4906                 goto out;
4907         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4908         r = kvm_write_guest_page(kvm, fn++, &data,
4909                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
4910         if (r < 0)
4911                 goto out;
4912         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4913         if (r < 0)
4914                 goto out;
4915         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4916         if (r < 0)
4917                 goto out;
4918         data = ~0;
4919         r = kvm_write_guest_page(kvm, fn, &data,
4920                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4921                                  sizeof(u8));
4922 out:
4923         srcu_read_unlock(&kvm->srcu, idx);
4924         return r;
4925 }
4926 
4927 static int init_rmode_identity_map(struct kvm *kvm)
4928 {
4929         int i, idx, r = 0;
4930         kvm_pfn_t identity_map_pfn;
4931         u32 tmp;
4932 
4933         /* Protect kvm->arch.ept_identity_pagetable_done. */
4934         mutex_lock(&kvm->slots_lock);
4935 
4936         if (likely(kvm->arch.ept_identity_pagetable_done))
4937                 goto out2;
4938 
4939         if (!kvm->arch.ept_identity_map_addr)
4940                 kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4941         identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4942 
4943         r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4944                                     kvm->arch.ept_identity_map_addr, PAGE_SIZE);
4945         if (r < 0)
4946                 goto out2;
4947 
4948         idx = srcu_read_lock(&kvm->srcu);
4949         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4950         if (r < 0)
4951                 goto out;
4952         /* Set up identity-mapping pagetable for EPT in real mode */
4953         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4954                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4955                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4956                 r = kvm_write_guest_page(kvm, identity_map_pfn,
4957                                 &tmp, i * sizeof(tmp), sizeof(tmp));
4958                 if (r < 0)
4959                         goto out;
4960         }
4961         kvm->arch.ept_identity_pagetable_done = true;
4962 
4963 out:
4964         srcu_read_unlock(&kvm->srcu, idx);
4965 
4966 out2:
4967         mutex_unlock(&kvm->slots_lock);
4968         return r;
4969 }
4970 
4971 static void seg_setup(int seg)
4972 {
4973         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4974         unsigned int ar;
4975 
4976         vmcs_write16(sf->selector, 0);
4977         vmcs_writel(sf->base, 0);
4978         vmcs_write32(sf->limit, 0xffff);
4979         ar = 0x93;
4980         if (seg == VCPU_SREG_CS)
4981                 ar |= 0x08; /* code segment */
4982 
4983         vmcs_write32(sf->ar_bytes, ar);
4984 }
4985 
4986 static int alloc_apic_access_page(struct kvm *kvm)
4987 {
4988         struct page *page;
4989         int r = 0;
4990 
4991         mutex_lock(&kvm->slots_lock);
4992         if (kvm->arch.apic_access_page_done)
4993                 goto out;
4994         r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
4995                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
4996         if (r)
4997                 goto out;
4998 
4999         page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
5000         if (is_error_page(page)) {
5001                 r = -EFAULT;
5002                 goto out;
5003         }
5004 
5005         /*
5006          * Do not pin the page in memory, so that memory hot-unplug
5007          * is able to migrate it.
5008          */
5009         put_page(page);
5010         kvm->arch.apic_access_page_done = true;
5011 out:
5012         mutex_unlock(&kvm->slots_lock);
5013         return r;
5014 }
5015 
5016 static int allocate_vpid(void)
5017 {
5018         int vpid;
5019 
5020         if (!enable_vpid)
5021                 return 0;
5022         spin_lock(&vmx_vpid_lock);
5023         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
5024         if (vpid < VMX_NR_VPIDS)
5025                 __set_bit(vpid, vmx_vpid_bitmap);
5026         else
5027                 vpid = 0;
5028         spin_unlock(&vmx_vpid_lock);
5029         return vpid;
5030 }
5031 
5032 static void free_vpid(int vpid)
5033 {
5034         if (!enable_vpid || vpid == 0)
5035                 return;
5036         spin_lock(&vmx_vpid_lock);
5037         __clear_bit(vpid, vmx_vpid_bitmap);
5038         spin_unlock(&vmx_vpid_lock);
5039 }
5040 
5041 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
5042                                                           u32 msr, int type)
5043 {
5044         int f = sizeof(unsigned long);
5045 
5046         if (!cpu_has_vmx_msr_bitmap())
5047                 return;
5048 
5049         /*
5050          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5051          * have the write-low and read-high bitmap offsets the wrong way round.
5052          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5053          */
5054         if (msr <= 0x1fff) {
5055                 if (type & MSR_TYPE_R)
5056                         /* read-low */
5057                         __clear_bit(msr, msr_bitmap + 0x000 / f);
5058 
5059                 if (type & MSR_TYPE_W)
5060                         /* write-low */
5061                         __clear_bit(msr, msr_bitmap + 0x800 / f);
5062 
5063         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5064                 msr &= 0x1fff;
5065                 if (type & MSR_TYPE_R)
5066                         /* read-high */
5067                         __clear_bit(msr, msr_bitmap + 0x400 / f);
5068 
5069                 if (type & MSR_TYPE_W)
5070                         /* write-high */
5071                         __clear_bit(msr, msr_bitmap + 0xc00 / f);
5072 
5073         }
5074 }
5075 
5076 static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
5077                                                          u32 msr, int type)
5078 {
5079         int f = sizeof(unsigned long);
5080 
5081         if (!cpu_has_vmx_msr_bitmap())
5082                 return;
5083 
5084         /*
5085          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5086          * have the write-low and read-high bitmap offsets the wrong way round.
5087          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5088          */
5089         if (msr <= 0x1fff) {
5090                 if (type & MSR_TYPE_R)
5091                         /* read-low */
5092                         __set_bit(msr, msr_bitmap + 0x000 / f);
5093 
5094                 if (type & MSR_TYPE_W)
5095                         /* write-low */
5096                         __set_bit(msr, msr_bitmap + 0x800 / f);
5097 
5098         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5099                 msr &= 0x1fff;
5100                 if (type & MSR_TYPE_R)
5101                         /* read-high */
5102                         __set_bit(msr, msr_bitmap + 0x400 / f);
5103 
5104                 if (type & MSR_TYPE_W)
5105                         /* write-high */
5106                         __set_bit(msr, msr_bitmap + 0xc00 / f);
5107 
5108         }
5109 }
5110 
5111 static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
5112                                                       u32 msr, int type, bool value)
5113 {
5114         if (value)
5115                 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
5116         else
5117                 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
5118 }
5119 
5120 /*
5121  * If a msr is allowed by L0, we should check whether it is allowed by L1.
5122  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
5123  */
5124 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
5125                                                unsigned long *msr_bitmap_nested,
5126                                                u32 msr, int type)
5127 {
5128         int f = sizeof(unsigned long);
5129 
5130         if (!cpu_has_vmx_msr_bitmap()) {
5131                 WARN_ON(1);
5132                 return;
5133         }
5134 
5135         /*
5136          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5137          * have the write-low and read-high bitmap offsets the wrong way round.
5138          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5139          */
5140         if (msr <= 0x1fff) {
5141                 if (type & MSR_TYPE_R &&
5142                    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
5143                         /* read-low */
5144                         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
5145 
5146                 if (type & MSR_TYPE_W &&
5147                    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
5148                         /* write-low */
5149                         __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
5150 
5151         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5152                 msr &= 0x1fff;
5153                 if (type & MSR_TYPE_R &&
5154                    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
5155                         /* read-high */
5156                         __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
5157 
5158                 if (type & MSR_TYPE_W &&
5159                    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
5160                         /* write-high */
5161                         __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
5162 
5163         }
5164 }
5165 
5166 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5167 {
5168         u8 mode = 0;
5169 
5170         if (cpu_has_secondary_exec_ctrls() &&
5171             (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
5172              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
5173                 mode |= MSR_BITMAP_MODE_X2APIC;
5174                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
5175                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
5176         }
5177 
5178         if (is_long_mode(vcpu))
5179                 mode |= MSR_BITMAP_MODE_LM;
5180 
5181         return mode;
5182 }
5183 
5184 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
5185 
5186 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
5187                                          u8 mode)
5188 {
5189         int msr;
5190 
5191         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
5192                 unsigned word = msr / BITS_PER_LONG;
5193                 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
5194                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
5195         }
5196 
5197         if (mode & MSR_BITMAP_MODE_X2APIC) {
5198                 /*
5199                  * TPR reads and writes can be virtualized even if virtual interrupt
5200                  * delivery is not in use.
5201                  */
5202                 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
5203                 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
5204                         vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
5205                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
5206                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
5207                 }
5208         }
5209 }
5210 
5211 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
5212 {
5213         struct vcpu_vmx *vmx = to_vmx(vcpu);
5214         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
5215         u8 mode = vmx_msr_bitmap_mode(vcpu);
5216         u8 changed = mode ^ vmx->msr_bitmap_mode;
5217 
5218         if (!changed)
5219                 return;
5220 
5221         vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
5222                                   !(mode & MSR_BITMAP_MODE_LM));
5223 
5224         if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
5225                 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
5226 
5227         vmx->msr_bitmap_mode = mode;
5228 }
5229 
5230 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
5231 {
5232         return enable_apicv;
5233 }
5234 
5235 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
5236 {
5237         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5238         gfn_t gfn;
5239 
5240         /*
5241          * Don't need to mark the APIC access page dirty; it is never
5242          * written to by the CPU during APIC virtualization.
5243          */
5244 
5245         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
5246                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
5247                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
5248         }
5249 
5250         if (nested_cpu_has_posted_intr(vmcs12)) {
5251                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
5252                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
5253         }
5254 }
5255 
5256 
5257 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
5258 {
5259         struct vcpu_vmx *vmx = to_vmx(vcpu);
5260         int max_irr;
5261         void *vapic_page;
5262         u16 status;
5263 
5264         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
5265                 return;
5266 
5267         vmx->nested.pi_pending = false;
5268         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
5269                 return;
5270 
5271         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
5272         if (max_irr != 256) {
5273                 vapic_page = kmap(vmx->nested.virtual_apic_page);
5274                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
5275                 kunmap(vmx->nested.virtual_apic_page);
5276 
5277                 status = vmcs_read16(GUEST_INTR_STATUS);
5278                 if ((u8)max_irr > ((u8)status & 0xff)) {
5279                         status &= ~0xff;
5280                         status |= (u8)max_irr;
5281                         vmcs_write16(GUEST_INTR_STATUS, status);
5282                 }
5283         }
5284 
5285         nested_mark_vmcs12_pages_dirty(vcpu);
5286 }
5287 
5288 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
5289                                                      bool nested)
5290 {
5291 #ifdef CONFIG_SMP
5292         int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
5293 
5294         if (vcpu->mode == IN_GUEST_MODE) {
5295                 /*
5296                  * The vector of interrupt to be delivered to vcpu had
5297                  * been set in PIR before this function.
5298                  *
5299                  * Following cases will be reached in this block, and
5300                  * we always send a notification event in all cases as
5301                  * explained below.
5302                  *
5303                  * Case 1: vcpu keeps in non-root mode. Sending a
5304                  * notification event posts the interrupt to vcpu.
5305                  *
5306                  * Case 2: vcpu exits to root mode and is still
5307                  * runnable. PIR will be synced to vIRR before the
5308                  * next vcpu entry. Sending a notification event in
5309                  * this case has no effect, as vcpu is not in root
5310                  * mode.
5311                  *
5312                  * Case 3: vcpu exits to root mode and is blocked.
5313                  * vcpu_block() has already synced PIR to vIRR and
5314                  * never blocks vcpu if vIRR is not cleared. Therefore,
5315                  * a blocked vcpu here does not wait for any requested
5316                  * interrupts in PIR, and sending a notification event
5317                  * which has no effect is safe here.
5318                  */
5319 
5320                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
5321                 return true;
5322         }
5323 #endif
5324         return false;
5325 }
5326 
5327 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
5328                                                 int vector)
5329 {
5330         struct vcpu_vmx *vmx = to_vmx(vcpu);
5331 
5332         if (is_guest_mode(vcpu) &&
5333             vector == vmx->nested.posted_intr_nv) {
5334                 /*
5335                  * If a posted intr is not recognized by hardware,
5336                  * we will accomplish it in the next vmentry.
5337                  */
5338                 vmx->nested.pi_pending = true;
5339                 kvm_make_request(KVM_REQ_EVENT, vcpu);
5340                 /* the PIR and ON have been set by L1. */
5341                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
5342                         kvm_vcpu_kick(vcpu);
5343                 return 0;
5344         }
5345         return -1;
5346 }
5347 /*
5348  * Send interrupt to vcpu via posted interrupt way.
5349  * 1. If target vcpu is running(non-root mode), send posted interrupt
5350  * notification to vcpu and hardware will sync PIR to vIRR atomically.
5351  * 2. If target vcpu isn't running(root mode), kick it to pick up the
5352  * interrupt from PIR in next vmentry.
5353  */
5354 static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
5355 {
5356         struct vcpu_vmx *vmx = to_vmx(vcpu);
5357         int r;
5358 
5359         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
5360         if (!r)
5361                 return;
5362 
5363         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
5364                 return;
5365 
5366         /* If a previous notification has sent the IPI, nothing to do.  */
5367         if (pi_test_and_set_on(&vmx->pi_desc))
5368                 return;
5369 
5370         if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
5371                 kvm_vcpu_kick(vcpu);
5372 }
5373 
5374 /*
5375  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
5376  * will not change in the lifetime of the guest.
5377  * Note that host-state that does change is set elsewhere. E.g., host-state
5378  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
5379  */
5380 static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
5381 {
5382         u32 low32, high32;
5383         unsigned long tmpl;
5384         struct desc_ptr dt;
5385         unsigned long cr0, cr3, cr4;
5386 
5387         cr0 = read_cr0();
5388         WARN_ON(cr0 & X86_CR0_TS);
5389         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
5390 
5391         /*
5392          * Save the most likely value for this task's CR3 in the VMCS.
5393          * We can't use __get_current_cr3_fast() because we're not atomic.
5394          */
5395         cr3 = __read_cr3();
5396         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
5397         vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
5398 
5399         /* Save the most likely value for this task's CR4 in the VMCS. */
5400         cr4 = cr4_read_shadow();
5401         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
5402         vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
5403 
5404         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
5405 #ifdef CONFIG_X86_64
5406         /*
5407          * Load null selectors, so we can avoid reloading them in
5408          * __vmx_load_host_state(), in case userspace uses the null selectors
5409          * too (the expected case).
5410          */
5411         vmcs_write16(HOST_DS_SELECTOR, 0);
5412         vmcs_write16(HOST_ES_SELECTOR, 0);
5413 #else
5414         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5415         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5416 #endif
5417         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5418         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
5419 
5420         store_idt(&dt);
5421         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
5422         vmx->host_idt_base = dt.address;
5423 
5424         vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
5425 
5426         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
5427         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
5428         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
5429         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
5430 
5431         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
5432                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
5433                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
5434         }
5435 }
5436 
5437 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
5438 {
5439         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
5440         if (enable_ept)
5441                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
5442         if (is_guest_mode(&vmx->vcpu))
5443                 vmx->vcpu.arch.cr4_guest_owned_bits &=
5444                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
5445         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
5446 }
5447 
5448 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
5449 {
5450         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
5451 
5452         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
5453                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
5454 
5455         if (!enable_vnmi)
5456                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
5457 
5458         /* Enable the preemption timer dynamically */
5459         pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
5460         return pin_based_exec_ctrl;
5461 }
5462 
5463 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
5464 {
5465         struct vcpu_vmx *vmx = to_vmx(vcpu);
5466 
5467         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
5468         if (cpu_has_secondary_exec_ctrls()) {
5469                 if (kvm_vcpu_apicv_active(vcpu))
5470                         vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
5471                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
5472                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5473                 else
5474                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5475                                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
5476                                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5477         }
5478 
5479         if (cpu_has_vmx_msr_bitmap())
5480                 vmx_update_msr_bitmap(vcpu);
5481 }
5482 
5483 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
5484 {
5485         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
5486 
5487         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
5488                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
5489 
5490         if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
5491                 exec_control &= ~CPU_BASED_TPR_SHADOW;
5492 #ifdef CONFIG_X86_64
5493                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
5494                                 CPU_BASED_CR8_LOAD_EXITING;
5495 #endif
5496         }
5497         if (!enable_ept)
5498                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
5499                                 CPU_BASED_CR3_LOAD_EXITING  |
5500                                 CPU_BASED_INVLPG_EXITING;
5501         return exec_control;
5502 }
5503 
5504 static bool vmx_rdrand_supported(void)
5505 {
5506         return vmcs_config.cpu_based_2nd_exec_ctrl &
5507                 SECONDARY_EXEC_RDRAND_EXITING;
5508 }
5509 
5510 static bool vmx_rdseed_supported(void)
5511 {
5512         return vmcs_config.cpu_based_2nd_exec_ctrl &
5513                 SECONDARY_EXEC_RDSEED_EXITING;
5514 }
5515 
5516 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5517 {
5518         struct kvm_vcpu *vcpu = &vmx->vcpu;
5519 
5520         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
5521         if (!cpu_need_virtualize_apic_accesses(vcpu))
5522                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5523         if (vmx->vpid == 0)
5524                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
5525         if (!enable_ept) {
5526                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
5527                 enable_unrestricted_guest = 0;
5528                 /* Enable INVPCID for non-ept guests may cause performance regression. */
5529                 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5530         }
5531         if (!enable_unrestricted_guest)
5532                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
5533         if (!ple_gap)
5534                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
5535         if (!kvm_vcpu_apicv_active(vcpu))
5536                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
5537                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5538         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
5539         /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
5540            (handle_vmptrld).
5541            We can NOT enable shadow_vmcs here because we don't have yet
5542            a current VMCS12
5543         */
5544         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5545 
5546         if (!enable_pml)
5547                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
5548 
5549         if (vmx_xsaves_supported()) {
5550                 /* Exposing XSAVES only when XSAVE is exposed */
5551                 bool xsaves_enabled =
5552                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
5553                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
5554 
5555                 if (!xsaves_enabled)
5556                         exec_control &= ~SECONDARY_EXEC_XSAVES;
5557 
5558                 if (nested) {
5559                         if (xsaves_enabled)
5560                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5561                                         SECONDARY_EXEC_XSAVES;
5562                         else
5563                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5564                                         ~SECONDARY_EXEC_XSAVES;
5565                 }
5566         }
5567 
5568         if (vmx_rdtscp_supported()) {
5569                 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
5570                 if (!rdtscp_enabled)
5571                         exec_control &= ~SECONDARY_EXEC_RDTSCP;
5572 
5573                 if (nested) {
5574                         if (rdtscp_enabled)
5575                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5576                                         SECONDARY_EXEC_RDTSCP;
5577                         else
5578                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5579                                         ~SECONDARY_EXEC_RDTSCP;
5580                 }
5581         }
5582 
5583         if (vmx_invpcid_supported()) {
5584                 /* Exposing INVPCID only when PCID is exposed */
5585                 bool invpcid_enabled =
5586                         guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
5587                         guest_cpuid_has(vcpu, X86_FEATURE_PCID);
5588 
5589                 if (!invpcid_enabled) {
5590                         exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5591                         guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
5592                 }
5593 
5594                 if (nested) {
5595                         if (invpcid_enabled)
5596                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5597                                         SECONDARY_EXEC_ENABLE_INVPCID;
5598                         else
5599                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5600                                         ~SECONDARY_EXEC_ENABLE_INVPCID;
5601                 }
5602         }
5603 
5604         if (vmx_rdrand_supported()) {
5605                 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
5606                 if (rdrand_enabled)
5607                         exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
5608 
5609                 if (nested) {
5610                         if (rdrand_enabled)
5611                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5612                                         SECONDARY_EXEC_RDRAND_EXITING;
5613                         else
5614                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5615                                         ~SECONDARY_EXEC_RDRAND_EXITING;
5616                 }
5617         }
5618 
5619         if (vmx_rdseed_supported()) {
5620                 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
5621                 if (rdseed_enabled)
5622                         exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
5623 
5624                 if (nested) {
5625                         if (rdseed_enabled)
5626                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5627                                         SECONDARY_EXEC_RDSEED_EXITING;
5628                         else
5629                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5630                                         ~SECONDARY_EXEC_RDSEED_EXITING;
5631                 }
5632         }
5633 
5634         vmx->secondary_exec_control = exec_control;
5635 }
5636 
5637 static void ept_set_mmio_spte_mask(void)
5638 {
5639         /*
5640          * EPT Misconfigurations can be generated if the value of bits 2:0
5641          * of an EPT paging-structure entry is 110b (write/execute).
5642          */
5643         kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
5644                                    VMX_EPT_MISCONFIG_WX_VALUE);
5645 }
5646 
5647 #define VMX_XSS_EXIT_BITMAP 0
5648 /*
5649  * Sets up the vmcs for emulated real mode.
5650  */
5651 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
5652 {
5653 #ifdef CONFIG_X86_64
5654         unsigned long a;
5655 #endif
5656         int i;
5657 
5658         /* I/O */
5659         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
5660         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
5661 
5662         if (enable_shadow_vmcs) {
5663                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5664                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5665         }
5666         if (cpu_has_vmx_msr_bitmap())
5667                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
5668 
5669         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
5670 
5671         /* Control */
5672         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
5673         vmx->hv_deadline_tsc = -1;
5674 
5675         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
5676 
5677         if (cpu_has_secondary_exec_ctrls()) {
5678                 vmx_compute_secondary_exec_control(vmx);
5679                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
5680                              vmx->secondary_exec_control);
5681         }
5682 
5683         if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
5684                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
5685                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
5686                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
5687                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
5688 
5689                 vmcs_write16(GUEST_INTR_STATUS, 0);
5690 
5691                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5692                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5693         }
5694 
5695         if (ple_gap) {
5696                 vmcs_write32(PLE_GAP, ple_gap);
5697                 vmx->ple_window = ple_window;
5698                 vmx->ple_window_dirty = true;
5699         }
5700 
5701         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
5702         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
5703         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
5704 
5705         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
5706         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
5707         vmx_set_constant_host_state(vmx);
5708 #ifdef CONFIG_X86_64
5709         rdmsrl(MSR_FS_BASE, a);
5710         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
5711         rdmsrl(MSR_GS_BASE, a);
5712         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
5713 #else
5714         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
5715         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5716 #endif
5717 
5718         if (cpu_has_vmx_vmfunc())
5719                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
5720 
5721         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5722         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5723         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
5724         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5725         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
5726 
5727         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
5728                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
5729 
5730         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
5731                 u32 index = vmx_msr_index[i];
5732                 u32 data_low, data_high;
5733                 int j = vmx->nmsrs;
5734 
5735                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
5736                         continue;
5737                 if (wrmsr_safe(index, data_low, data_high) < 0)
5738                         continue;
5739                 vmx->guest_msrs[j].index = i;
5740                 vmx->guest_msrs[j].data = 0;
5741                 vmx->guest_msrs[j].mask = -1ull;
5742                 ++vmx->nmsrs;
5743         }
5744 
5745         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
5746                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
5747 
5748         vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
5749 
5750         /* 22.2.1, 20.8.1 */
5751         vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
5752 
5753         vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5754         vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5755 
5756         set_cr4_guest_host_mask(vmx);
5757 
5758         if (vmx_xsaves_supported())
5759                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
5760 
5761         if (enable_pml) {
5762                 ASSERT(vmx->pml_pg);
5763                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5764                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5765         }
5766 }
5767 
5768 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5769 {
5770         struct vcpu_vmx *vmx = to_vmx(vcpu);
5771         struct msr_data apic_base_msr;
5772         u64 cr0;
5773 
5774         vmx->rmode.vm86_active = 0;
5775         vmx->spec_ctrl = 0;
5776 
5777         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5778         kvm_set_cr8(vcpu, 0);
5779 
5780         if (!init_event) {
5781                 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
5782                                      MSR_IA32_APICBASE_ENABLE;
5783                 if (kvm_vcpu_is_reset_bsp(vcpu))
5784                         apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
5785                 apic_base_msr.host_initiated = true;
5786                 kvm_set_apic_base(vcpu, &apic_base_msr);
5787         }
5788 
5789         vmx_segment_cache_clear(vmx);
5790 
5791         seg_setup(VCPU_SREG_CS);
5792         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5793         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5794 
5795         seg_setup(VCPU_SREG_DS);
5796         seg_setup(VCPU_SREG_ES);
5797         seg_setup(VCPU_SREG_FS);
5798         seg_setup(VCPU_SREG_GS);
5799         seg_setup(VCPU_SREG_SS);
5800 
5801         vmcs_write16(GUEST_TR_SELECTOR, 0);
5802         vmcs_writel(GUEST_TR_BASE, 0);
5803         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5804         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5805 
5806         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5807         vmcs_writel(GUEST_LDTR_BASE, 0);
5808         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5809         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5810 
5811         if (!init_event) {
5812                 vmcs_write32(GUEST_SYSENTER_CS, 0);
5813                 vmcs_writel(GUEST_SYSENTER_ESP, 0);
5814                 vmcs_writel(GUEST_SYSENTER_EIP, 0);
5815                 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
5816         }
5817 
5818         kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
5819         kvm_rip_write(vcpu, 0xfff0);
5820 
5821         vmcs_writel(GUEST_GDTR_BASE, 0);
5822         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5823 
5824         vmcs_writel(GUEST_IDTR_BASE, 0);
5825         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5826 
5827         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5828         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5829         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5830         if (kvm_mpx_supported())
5831                 vmcs_write64(GUEST_BNDCFGS, 0);
5832 
5833         setup_msrs(vmx);
5834 
5835         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
5836 
5837         if (cpu_has_vmx_tpr_shadow() && !init_event) {
5838                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5839                 if (cpu_need_tpr_shadow(vcpu))
5840                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5841                                      __pa(vcpu->arch.apic->regs));
5842                 vmcs_write32(TPR_THRESHOLD, 0);
5843         }
5844 
5845         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5846 
5847         if (vmx->vpid != 0)
5848                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
5849 
5850         cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
5851         vmx->vcpu.arch.cr0 = cr0;
5852         vmx_set_cr0(vcpu, cr0); /* enter rmode */
5853         vmx_set_cr4(vcpu, 0);
5854         vmx_set_efer(vcpu, 0);
5855 
5856         update_exception_bitmap(vcpu);
5857 
5858         vpid_sync_context(vmx->vpid);
5859 }
5860 
5861 /*
5862  * In nested virtualization, check if L1 asked to exit on external interrupts.
5863  * For most existing hypervisors, this will always return true.
5864  */
5865 static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
5866 {
5867         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
5868                 PIN_BASED_EXT_INTR_MASK;
5869 }
5870 
5871 /*
5872  * In nested virtualization, check if L1 has set
5873  * VM_EXIT_ACK_INTR_ON_EXIT
5874  */
5875 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
5876 {
5877         return get_vmcs12(vcpu)->vm_exit_controls &
5878                 VM_EXIT_ACK_INTR_ON_EXIT;
5879 }
5880 
5881 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
5882 {
5883         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
5884                 PIN_BASED_NMI_EXITING;
5885 }
5886 
5887 static void enable_irq_window(struct kvm_vcpu *vcpu)
5888 {
5889         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5890                       CPU_BASED_VIRTUAL_INTR_PENDING);
5891 }
5892 
5893 static void enable_nmi_window(struct kvm_vcpu *vcpu)
5894 {
5895         if (!enable_vnmi ||
5896             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5897                 enable_irq_window(vcpu);
5898                 return;
5899         }
5900 
5901         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5902                       CPU_BASED_VIRTUAL_NMI_PENDING);
5903 }
5904 
5905 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5906 {
5907         struct vcpu_vmx *vmx = to_vmx(vcpu);
5908         uint32_t intr;
5909         int irq = vcpu->arch.interrupt.nr;
5910 
5911         trace_kvm_inj_virq(irq);
5912 
5913         ++vcpu->stat.irq_injections;
5914         if (vmx->rmode.vm86_active) {
5915                 int inc_eip = 0;
5916                 if (vcpu->arch.interrupt.soft)
5917                         inc_eip = vcpu->arch.event_exit_inst_len;
5918                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
5919                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5920                 return;
5921         }
5922         intr = irq | INTR_INFO_VALID_MASK;
5923         if (vcpu->arch.interrupt.soft) {
5924                 intr |= INTR_TYPE_SOFT_INTR;
5925                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5926                              vmx->vcpu.arch.event_exit_inst_len);
5927         } else
5928                 intr |= INTR_TYPE_EXT_INTR;
5929         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5930 }
5931 
5932 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5933 {
5934         struct vcpu_vmx *vmx = to_vmx(vcpu);
5935 
5936         if (!enable_vnmi) {
5937                 /*
5938                  * Tracking the NMI-blocked state in software is built upon
5939                  * finding the next open IRQ window. This, in turn, depends on
5940                  * well-behaving guests: They have to keep IRQs disabled at
5941