TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/vmx.c

  1 /*
  2  * Kernel-based Virtual Machine driver for Linux
  3  *
  4  * This module enables machines with Intel VT-x extensions to run virtual
  5  * machines without emulation or binary translation.
  6  *
  7  * Copyright (C) 2006 Qumranet, Inc.
  8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  9  *
 10  * Authors:
 11  *   Avi Kivity   <avi@qumranet.com>
 12  *   Yaniv Kamay  <yaniv@qumranet.com>
 13  *
 14  * This work is licensed under the terms of the GNU GPL, version 2.  See
 15  * the COPYING file in the top-level directory.
 16  *
 17  */
 18 
 19 #include "irq.h"
 20 #include "mmu.h"
 21 #include "cpuid.h"
 22 
 23 #include <linux/kvm_host.h>
 24 #include <linux/module.h>
 25 #include <linux/kernel.h>
 26 #include <linux/mm.h>
 27 #include <linux/highmem.h>
 28 #include <linux/sched.h>
 29 #include <linux/moduleparam.h>
 30 #include <linux/mod_devicetable.h>
 31 #include <linux/ftrace_event.h>
 32 #include <linux/slab.h>
 33 #include <linux/tboot.h>
 34 #include "kvm_cache_regs.h"
 35 #include "x86.h"
 36 
 37 #include <asm/io.h>
 38 #include <asm/desc.h>
 39 #include <asm/vmx.h>
 40 #include <asm/virtext.h>
 41 #include <asm/mce.h>
 42 #include <asm/i387.h>
 43 #include <asm/xcr.h>
 44 #include <asm/perf_event.h>
 45 #include <asm/kexec.h>
 46 
 47 #include "trace.h"
 48 
 49 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 50 #define __ex_clear(x, reg) \
 51         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 52 
 53 MODULE_AUTHOR("Qumranet");
 54 MODULE_LICENSE("GPL");
 55 
 56 static const struct x86_cpu_id vmx_cpu_id[] = {
 57         X86_FEATURE_MATCH(X86_FEATURE_VMX),
 58         {}
 59 };
 60 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 61 
 62 static bool __read_mostly enable_vpid = 1;
 63 module_param_named(vpid, enable_vpid, bool, 0444);
 64 
 65 static bool __read_mostly flexpriority_enabled = 1;
 66 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 67 
 68 static bool __read_mostly enable_ept = 1;
 69 module_param_named(ept, enable_ept, bool, S_IRUGO);
 70 
 71 static bool __read_mostly enable_unrestricted_guest = 1;
 72 module_param_named(unrestricted_guest,
 73                         enable_unrestricted_guest, bool, S_IRUGO);
 74 
 75 static bool __read_mostly enable_ept_ad_bits = 1;
 76 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 77 
 78 static bool __read_mostly emulate_invalid_guest_state = true;
 79 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 80 
 81 static bool __read_mostly vmm_exclusive = 1;
 82 module_param(vmm_exclusive, bool, S_IRUGO);
 83 
 84 static bool __read_mostly fasteoi = 1;
 85 module_param(fasteoi, bool, S_IRUGO);
 86 
 87 static bool __read_mostly enable_apicv = 1;
 88 module_param(enable_apicv, bool, S_IRUGO);
 89 
 90 static bool __read_mostly enable_shadow_vmcs = 1;
 91 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 92 /*
 93  * If nested=1, nested virtualization is supported, i.e., guests may use
 94  * VMX and act as hypervisors for their own guests. If nested=0, guests may not
 95  * use VMX instructions.
 96  */
 97 static bool __read_mostly nested = 0;
 98 module_param(nested, bool, S_IRUGO);
 99 
100 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
101 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
102 #define KVM_VM_CR0_ALWAYS_ON                                            \
103         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
104 #define KVM_CR4_GUEST_OWNED_BITS                                      \
105         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
106          | X86_CR4_OSXMMEXCPT)
107 
108 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
109 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
110 
111 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
112 
113 /*
114  * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
115  * ple_gap:    upper bound on the amount of time between two successive
116  *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
117  *             According to tests, this time is usually smaller than 128 cycles.
118  * ple_window: upper bound on the amount of time a guest is allowed to execute
119  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
120  *             less than 2^12 cycles.
121  * Time is measured based on a counter that runs at the same rate as the TSC,
122  * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
123  */
124 #define KVM_VMX_DEFAULT_PLE_GAP    128
125 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
126 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
127 module_param(ple_gap, int, S_IRUGO);
128 
129 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
130 module_param(ple_window, int, S_IRUGO);
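/*
 * Editor's note, not part of the original file: ple_gap and ple_window are
 * ordinary module parameters of kvm_intel, so (assuming the usual build as
 * kvm-intel.ko) they could be overridden at module load time, e.g.:
 *
 *	modprobe kvm_intel ple_gap=128 ple_window=4096
 *
 * Per the comment above, ple_gap also indicates whether PLE is enabled, so a
 * value of 0 presumably disables Pause-Loop Exiting altogether.
 */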
131 
132 extern const ulong vmx_return;
133 
134 #define NR_AUTOLOAD_MSRS 8
135 #define VMCS02_POOL_SIZE 1
136 
137 struct vmcs {
138         u32 revision_id;
139         u32 abort;
140         char data[0];
141 };
142 
143 /*
144  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
145  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
146  * loaded on this CPU (so we can clear them if the CPU goes down).
147  */
148 struct loaded_vmcs {
149         struct vmcs *vmcs;
150         int cpu;
151         int launched;
152         struct list_head loaded_vmcss_on_cpu_link;
153 };
154 
155 struct shared_msr_entry {
156         unsigned index;
157         u64 data;
158         u64 mask;
159 };
160 
161 /*
162  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
163  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
164  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
165  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
166  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
167  * More than one of these structures may exist, if L1 runs multiple L2 guests.
168  * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
169  * underlying hardware which will be used to run L2.
170  * This structure is packed to ensure that its layout is identical across
171  * machines (necessary for live migration).
172  * If there are changes in this struct, VMCS12_REVISION must be changed.
173  */
174 typedef u64 natural_width;
175 struct __packed vmcs12 {
176         /* According to the Intel spec, a VMCS region must start with the
177          * following two fields. Then follow implementation-specific data.
178          */
179         u32 revision_id;
180         u32 abort;
181 
182         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
183         u32 padding[7]; /* room for future expansion */
184 
185         u64 io_bitmap_a;
186         u64 io_bitmap_b;
187         u64 msr_bitmap;
188         u64 vm_exit_msr_store_addr;
189         u64 vm_exit_msr_load_addr;
190         u64 vm_entry_msr_load_addr;
191         u64 tsc_offset;
192         u64 virtual_apic_page_addr;
193         u64 apic_access_addr;
194         u64 ept_pointer;
195         u64 guest_physical_address;
196         u64 vmcs_link_pointer;
197         u64 guest_ia32_debugctl;
198         u64 guest_ia32_pat;
199         u64 guest_ia32_efer;
200         u64 guest_ia32_perf_global_ctrl;
201         u64 guest_pdptr0;
202         u64 guest_pdptr1;
203         u64 guest_pdptr2;
204         u64 guest_pdptr3;
205         u64 host_ia32_pat;
206         u64 host_ia32_efer;
207         u64 host_ia32_perf_global_ctrl;
208         u64 padding64[8]; /* room for future expansion */
209         /*
210          * To allow migration of L1 (complete with its L2 guests) between
211          * machines of different natural widths (32 or 64 bit), we cannot have
212  * unsigned long fields with no explicit size. We use u64 (aliased
213          * natural_width) instead. Luckily, x86 is little-endian.
214          */
215         natural_width cr0_guest_host_mask;
216         natural_width cr4_guest_host_mask;
217         natural_width cr0_read_shadow;
218         natural_width cr4_read_shadow;
219         natural_width cr3_target_value0;
220         natural_width cr3_target_value1;
221         natural_width cr3_target_value2;
222         natural_width cr3_target_value3;
223         natural_width exit_qualification;
224         natural_width guest_linear_address;
225         natural_width guest_cr0;
226         natural_width guest_cr3;
227         natural_width guest_cr4;
228         natural_width guest_es_base;
229         natural_width guest_cs_base;
230         natural_width guest_ss_base;
231         natural_width guest_ds_base;
232         natural_width guest_fs_base;
233         natural_width guest_gs_base;
234         natural_width guest_ldtr_base;
235         natural_width guest_tr_base;
236         natural_width guest_gdtr_base;
237         natural_width guest_idtr_base;
238         natural_width guest_dr7;
239         natural_width guest_rsp;
240         natural_width guest_rip;
241         natural_width guest_rflags;
242         natural_width guest_pending_dbg_exceptions;
243         natural_width guest_sysenter_esp;
244         natural_width guest_sysenter_eip;
245         natural_width host_cr0;
246         natural_width host_cr3;
247         natural_width host_cr4;
248         natural_width host_fs_base;
249         natural_width host_gs_base;
250         natural_width host_tr_base;
251         natural_width host_gdtr_base;
252         natural_width host_idtr_base;
253         natural_width host_ia32_sysenter_esp;
254         natural_width host_ia32_sysenter_eip;
255         natural_width host_rsp;
256         natural_width host_rip;
257         natural_width paddingl[8]; /* room for future expansion */
258         u32 pin_based_vm_exec_control;
259         u32 cpu_based_vm_exec_control;
260         u32 exception_bitmap;
261         u32 page_fault_error_code_mask;
262         u32 page_fault_error_code_match;
263         u32 cr3_target_count;
264         u32 vm_exit_controls;
265         u32 vm_exit_msr_store_count;
266         u32 vm_exit_msr_load_count;
267         u32 vm_entry_controls;
268         u32 vm_entry_msr_load_count;
269         u32 vm_entry_intr_info_field;
270         u32 vm_entry_exception_error_code;
271         u32 vm_entry_instruction_len;
272         u32 tpr_threshold;
273         u32 secondary_vm_exec_control;
274         u32 vm_instruction_error;
275         u32 vm_exit_reason;
276         u32 vm_exit_intr_info;
277         u32 vm_exit_intr_error_code;
278         u32 idt_vectoring_info_field;
279         u32 idt_vectoring_error_code;
280         u32 vm_exit_instruction_len;
281         u32 vmx_instruction_info;
282         u32 guest_es_limit;
283         u32 guest_cs_limit;
284         u32 guest_ss_limit;
285         u32 guest_ds_limit;
286         u32 guest_fs_limit;
287         u32 guest_gs_limit;
288         u32 guest_ldtr_limit;
289         u32 guest_tr_limit;
290         u32 guest_gdtr_limit;
291         u32 guest_idtr_limit;
292         u32 guest_es_ar_bytes;
293         u32 guest_cs_ar_bytes;
294         u32 guest_ss_ar_bytes;
295         u32 guest_ds_ar_bytes;
296         u32 guest_fs_ar_bytes;
297         u32 guest_gs_ar_bytes;
298         u32 guest_ldtr_ar_bytes;
299         u32 guest_tr_ar_bytes;
300         u32 guest_interruptibility_info;
301         u32 guest_activity_state;
302         u32 guest_sysenter_cs;
303         u32 host_ia32_sysenter_cs;
304         u32 vmx_preemption_timer_value;
305         u32 padding32[7]; /* room for future expansion */
306         u16 virtual_processor_id;
307         u16 guest_es_selector;
308         u16 guest_cs_selector;
309         u16 guest_ss_selector;
310         u16 guest_ds_selector;
311         u16 guest_fs_selector;
312         u16 guest_gs_selector;
313         u16 guest_ldtr_selector;
314         u16 guest_tr_selector;
315         u16 host_es_selector;
316         u16 host_cs_selector;
317         u16 host_ss_selector;
318         u16 host_ds_selector;
319         u16 host_fs_selector;
320         u16 host_gs_selector;
321         u16 host_tr_selector;
322 };
323 
324 /*
325  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
326  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
327  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
328  */
329 #define VMCS12_REVISION 0x11e57ed0
330 
331 /*
332  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
333  * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
334  * current implementation, 4K is reserved to avoid future complications.
335  */
336 #define VMCS12_SIZE 0x1000
337 
338 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
339 struct vmcs02_list {
340         struct list_head list;
341         gpa_t vmptr;
342         struct loaded_vmcs vmcs02;
343 };
344 
345 /*
346  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
347  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
348  */
349 struct nested_vmx {
350         /* Has the level-1 guest done vmxon? */
351         bool vmxon;
352 
353         /* The guest-physical address of the current VMCS L1 keeps for L2 */
354         gpa_t current_vmptr;
355         /* The host-usable pointer to the above */
356         struct page *current_vmcs12_page;
357         struct vmcs12 *current_vmcs12;
358         struct vmcs *current_shadow_vmcs;
359         /*
360          * Indicates if the shadow vmcs must be updated with the
361          * data held by vmcs12
362          */
363         bool sync_shadow_vmcs;
364 
365         /* vmcs02_list cache of VMCSs recently used to run L2 guests */
366         struct list_head vmcs02_pool;
367         int vmcs02_num;
368         u64 vmcs01_tsc_offset;
369         /* L2 must run next, and mustn't decide to exit to L1. */
370         bool nested_run_pending;
371         /*
372          * Guest pages referred to in vmcs02 with host-physical pointers, so
373          * we must keep them pinned while L2 runs.
374          */
375         struct page *apic_access_page;
376         u64 msr_ia32_feature_control;
377 };
378 
379 #define POSTED_INTR_ON  0
380 /* Posted-Interrupt Descriptor */
381 struct pi_desc {
382         u32 pir[8];     /* Posted interrupt requested */
383         u32 control;    /* bit 0 of control is outstanding notification bit */
384         u32 rsvd[7];
385 } __aligned(64);
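/*
 * Editor's note, not part of the original file: pir[] is treated as a 256-bit
 * bitmap of pending vectors (see pi_test_and_set_pir() below), so vector v
 * corresponds to bit (v & 31) of pir[v >> 5] in this little-endian layout.
 */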
386 
387 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
388 {
389         return test_and_set_bit(POSTED_INTR_ON,
390                         (unsigned long *)&pi_desc->control);
391 }
392 
393 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
394 {
395         return test_and_clear_bit(POSTED_INTR_ON,
396                         (unsigned long *)&pi_desc->control);
397 }
398 
399 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
400 {
401         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
402 }
403 
404 struct vcpu_vmx {
405         struct kvm_vcpu       vcpu;
406         unsigned long         host_rsp;
407         u8                    fail;
408         u8                    cpl;
409         bool                  nmi_known_unmasked;
410         u32                   exit_intr_info;
411         u32                   idt_vectoring_info;
412         ulong                 rflags;
413         struct shared_msr_entry *guest_msrs;
414         int                   nmsrs;
415         int                   save_nmsrs;
416         unsigned long         host_idt_base;
417 #ifdef CONFIG_X86_64
418         u64                   msr_host_kernel_gs_base;
419         u64                   msr_guest_kernel_gs_base;
420 #endif
421         u32 vm_entry_controls_shadow;
422         u32 vm_exit_controls_shadow;
423         /*
424          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
425          * non-nested (L1) guest, it always points to vmcs01. For a nested
426          * guest (L2), it points to a different VMCS.
427          */
428         struct loaded_vmcs    vmcs01;
429         struct loaded_vmcs   *loaded_vmcs;
430         bool                  __launched; /* temporary, used in vmx_vcpu_run */
431         struct msr_autoload {
432                 unsigned nr;
433                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
434                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
435         } msr_autoload;
436         struct {
437                 int           loaded;
438                 u16           fs_sel, gs_sel, ldt_sel;
439 #ifdef CONFIG_X86_64
440                 u16           ds_sel, es_sel;
441 #endif
442                 int           gs_ldt_reload_needed;
443                 int           fs_reload_needed;
444                 unsigned long vmcs_host_cr4;    /* May not match real cr4 */
445         } host_state;
446         struct {
447                 int vm86_active;
448                 ulong save_rflags;
449                 struct kvm_segment segs[8];
450         } rmode;
451         struct {
452                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
453                 struct kvm_save_segment {
454                         u16 selector;
455                         unsigned long base;
456                         u32 limit;
457                         u32 ar;
458                 } seg[8];
459         } segment_cache;
460         int vpid;
461         bool emulation_required;
462 
463         /* Support for vnmi-less CPUs */
464         int soft_vnmi_blocked;
465         ktime_t entry_time;
466         s64 vnmi_blocked_time;
467         u32 exit_reason;
468 
469         bool rdtscp_enabled;
470 
471         /* Posted interrupt descriptor */
472         struct pi_desc pi_desc;
473 
474         /* Support for a guest hypervisor (nested VMX) */
475         struct nested_vmx nested;
476 };
477 
478 enum segment_cache_field {
479         SEG_FIELD_SEL = 0,
480         SEG_FIELD_BASE = 1,
481         SEG_FIELD_LIMIT = 2,
482         SEG_FIELD_AR = 3,
483 
484         SEG_FIELD_NR = 4
485 };
486 
487 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
488 {
489         return container_of(vcpu, struct vcpu_vmx, vcpu);
490 }
491 
492 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
493 #define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
494 #define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
495                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
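/*
 * Editor's note, not part of the original file: as an illustration, the
 * FIELD64() macro above makes
 *
 *	FIELD64(IO_BITMAP_A, io_bitmap_a)
 *
 * effectively expand to the two designated initializers
 *
 *	[IO_BITMAP_A]      = offsetof(struct vmcs12, io_bitmap_a),
 *	[IO_BITMAP_A_HIGH] = offsetof(struct vmcs12, io_bitmap_a) + 4,
 *
 * i.e. the "_HIGH" encoding of a 64-bit field maps to the upper 32 bits of
 * the same vmcs12 member.
 */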
496 
497 
498 static const unsigned long shadow_read_only_fields[] = {
499         /*
500          * We do NOT shadow fields that are modified when L0
501          * traps and emulates any vmx instruction (e.g. VMPTRLD,
502          * VMXON...) executed by L1.
503          * For example, VM_INSTRUCTION_ERROR is read
504          * by L1 if a vmx instruction fails (part of the error path).
505          * Note the code assumes this logic. If for some reason
506          * we start shadowing these fields then we need to
507          * force a shadow sync when L0 emulates vmx instructions
508          * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
509          * by nested_vmx_failValid)
510          */
511         VM_EXIT_REASON,
512         VM_EXIT_INTR_INFO,
513         VM_EXIT_INSTRUCTION_LEN,
514         IDT_VECTORING_INFO_FIELD,
515         IDT_VECTORING_ERROR_CODE,
516         VM_EXIT_INTR_ERROR_CODE,
517         EXIT_QUALIFICATION,
518         GUEST_LINEAR_ADDRESS,
519         GUEST_PHYSICAL_ADDRESS
520 };
521 static const int max_shadow_read_only_fields =
522         ARRAY_SIZE(shadow_read_only_fields);
523 
524 static const unsigned long shadow_read_write_fields[] = {
525         GUEST_RIP,
526         GUEST_RSP,
527         GUEST_CR0,
528         GUEST_CR3,
529         GUEST_CR4,
530         GUEST_INTERRUPTIBILITY_INFO,
531         GUEST_RFLAGS,
532         GUEST_CS_SELECTOR,
533         GUEST_CS_AR_BYTES,
534         GUEST_CS_LIMIT,
535         GUEST_CS_BASE,
536         GUEST_ES_BASE,
537         CR0_GUEST_HOST_MASK,
538         CR0_READ_SHADOW,
539         CR4_READ_SHADOW,
540         TSC_OFFSET,
541         EXCEPTION_BITMAP,
542         CPU_BASED_VM_EXEC_CONTROL,
543         VM_ENTRY_EXCEPTION_ERROR_CODE,
544         VM_ENTRY_INTR_INFO_FIELD,
545         VM_ENTRY_INSTRUCTION_LEN,
547         HOST_FS_BASE,
548         HOST_GS_BASE,
549         HOST_FS_SELECTOR,
550         HOST_GS_SELECTOR
551 };
552 static const int max_shadow_read_write_fields =
553         ARRAY_SIZE(shadow_read_write_fields);
554 
555 static const unsigned short vmcs_field_to_offset_table[] = {
556         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
557         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
558         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
559         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
560         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
561         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
562         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
563         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
564         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
565         FIELD(HOST_ES_SELECTOR, host_es_selector),
566         FIELD(HOST_CS_SELECTOR, host_cs_selector),
567         FIELD(HOST_SS_SELECTOR, host_ss_selector),
568         FIELD(HOST_DS_SELECTOR, host_ds_selector),
569         FIELD(HOST_FS_SELECTOR, host_fs_selector),
570         FIELD(HOST_GS_SELECTOR, host_gs_selector),
571         FIELD(HOST_TR_SELECTOR, host_tr_selector),
572         FIELD64(IO_BITMAP_A, io_bitmap_a),
573         FIELD64(IO_BITMAP_B, io_bitmap_b),
574         FIELD64(MSR_BITMAP, msr_bitmap),
575         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
576         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
577         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
578         FIELD64(TSC_OFFSET, tsc_offset),
579         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
580         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
581         FIELD64(EPT_POINTER, ept_pointer),
582         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
583         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
584         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
585         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
586         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
587         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
588         FIELD64(GUEST_PDPTR0, guest_pdptr0),
589         FIELD64(GUEST_PDPTR1, guest_pdptr1),
590         FIELD64(GUEST_PDPTR2, guest_pdptr2),
591         FIELD64(GUEST_PDPTR3, guest_pdptr3),
592         FIELD64(HOST_IA32_PAT, host_ia32_pat),
593         FIELD64(HOST_IA32_EFER, host_ia32_efer),
594         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
595         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
596         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
597         FIELD(EXCEPTION_BITMAP, exception_bitmap),
598         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
599         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
600         FIELD(CR3_TARGET_COUNT, cr3_target_count),
601         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
602         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
603         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
604         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
605         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
606         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
607         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
608         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
609         FIELD(TPR_THRESHOLD, tpr_threshold),
610         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
611         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
612         FIELD(VM_EXIT_REASON, vm_exit_reason),
613         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
614         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
615         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
616         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
617         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
618         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
619         FIELD(GUEST_ES_LIMIT, guest_es_limit),
620         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
621         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
622         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
623         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
624         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
625         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
626         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
627         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
628         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
629         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
630         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
631         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
632         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
633         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
634         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
635         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
636         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
637         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
638         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
639         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
640         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
641         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
642         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
643         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
644         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
645         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
646         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
647         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
648         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
649         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
650         FIELD(EXIT_QUALIFICATION, exit_qualification),
651         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
652         FIELD(GUEST_CR0, guest_cr0),
653         FIELD(GUEST_CR3, guest_cr3),
654         FIELD(GUEST_CR4, guest_cr4),
655         FIELD(GUEST_ES_BASE, guest_es_base),
656         FIELD(GUEST_CS_BASE, guest_cs_base),
657         FIELD(GUEST_SS_BASE, guest_ss_base),
658         FIELD(GUEST_DS_BASE, guest_ds_base),
659         FIELD(GUEST_FS_BASE, guest_fs_base),
660         FIELD(GUEST_GS_BASE, guest_gs_base),
661         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
662         FIELD(GUEST_TR_BASE, guest_tr_base),
663         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
664         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
665         FIELD(GUEST_DR7, guest_dr7),
666         FIELD(GUEST_RSP, guest_rsp),
667         FIELD(GUEST_RIP, guest_rip),
668         FIELD(GUEST_RFLAGS, guest_rflags),
669         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
670         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
671         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
672         FIELD(HOST_CR0, host_cr0),
673         FIELD(HOST_CR3, host_cr3),
674         FIELD(HOST_CR4, host_cr4),
675         FIELD(HOST_FS_BASE, host_fs_base),
676         FIELD(HOST_GS_BASE, host_gs_base),
677         FIELD(HOST_TR_BASE, host_tr_base),
678         FIELD(HOST_GDTR_BASE, host_gdtr_base),
679         FIELD(HOST_IDTR_BASE, host_idtr_base),
680         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
681         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
682         FIELD(HOST_RSP, host_rsp),
683         FIELD(HOST_RIP, host_rip),
684 };
685 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
686 
687 static inline short vmcs_field_to_offset(unsigned long field)
688 {
689         if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
690                 return -1;
691         return vmcs_field_to_offset_table[field];
692 }
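/*
 * Editor's note, not part of the original file: a minimal sketch of how a
 * caller could combine this helper with get_vmcs12() below to read a
 * natural-width field of the in-memory vmcs12, e.g. GUEST_RIP:
 *
 *	short offset = vmcs_field_to_offset(GUEST_RIP);
 *	if (offset >= 0) {
 *		char *p = (char *)get_vmcs12(vcpu) + offset;
 *		natural_width val = *(natural_width *)p;
 *	}
 *
 * The actual read/write helpers used when emulating VMREAD/VMWRITE follow the
 * same offset-based pattern but also dispatch on the field width.
 */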
693 
694 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
695 {
696         return to_vmx(vcpu)->nested.current_vmcs12;
697 }
698 
699 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
700 {
701         struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
702         if (is_error_page(page))
703                 return NULL;
704 
705         return page;
706 }
707 
708 static void nested_release_page(struct page *page)
709 {
710         kvm_release_page_dirty(page);
711 }
712 
713 static void nested_release_page_clean(struct page *page)
714 {
715         kvm_release_page_clean(page);
716 }
717 
718 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
719 static u64 construct_eptp(unsigned long root_hpa);
720 static void kvm_cpu_vmxon(u64 addr);
721 static void kvm_cpu_vmxoff(void);
722 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
723 static void vmx_set_segment(struct kvm_vcpu *vcpu,
724                             struct kvm_segment *var, int seg);
725 static void vmx_get_segment(struct kvm_vcpu *vcpu,
726                             struct kvm_segment *var, int seg);
727 static bool guest_state_valid(struct kvm_vcpu *vcpu);
728 static u32 vmx_segment_access_rights(struct kvm_segment *var);
729 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
730 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
731 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
732 
733 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
734 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
735 /*
736  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
737  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
738  */
739 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
740 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
741 
742 static unsigned long *vmx_io_bitmap_a;
743 static unsigned long *vmx_io_bitmap_b;
744 static unsigned long *vmx_msr_bitmap_legacy;
745 static unsigned long *vmx_msr_bitmap_longmode;
746 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
747 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
748 static unsigned long *vmx_vmread_bitmap;
749 static unsigned long *vmx_vmwrite_bitmap;
750 
751 static bool cpu_has_load_ia32_efer;
752 static bool cpu_has_load_perf_global_ctrl;
753 
754 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
755 static DEFINE_SPINLOCK(vmx_vpid_lock);
756 
757 static struct vmcs_config {
758         int size;
759         int order;
760         u32 revision_id;
761         u32 pin_based_exec_ctrl;
762         u32 cpu_based_exec_ctrl;
763         u32 cpu_based_2nd_exec_ctrl;
764         u32 vmexit_ctrl;
765         u32 vmentry_ctrl;
766 } vmcs_config;
767 
768 static struct vmx_capability {
769         u32 ept;
770         u32 vpid;
771 } vmx_capability;
772 
773 #define VMX_SEGMENT_FIELD(seg)                                  \
774         [VCPU_SREG_##seg] = {                                   \
775                 .selector = GUEST_##seg##_SELECTOR,             \
776                 .base = GUEST_##seg##_BASE,                     \
777                 .limit = GUEST_##seg##_LIMIT,                   \
778                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
779         }
780 
781 static const struct kvm_vmx_segment_field {
782         unsigned selector;
783         unsigned base;
784         unsigned limit;
785         unsigned ar_bytes;
786 } kvm_vmx_segment_fields[] = {
787         VMX_SEGMENT_FIELD(CS),
788         VMX_SEGMENT_FIELD(DS),
789         VMX_SEGMENT_FIELD(ES),
790         VMX_SEGMENT_FIELD(FS),
791         VMX_SEGMENT_FIELD(GS),
792         VMX_SEGMENT_FIELD(SS),
793         VMX_SEGMENT_FIELD(TR),
794         VMX_SEGMENT_FIELD(LDTR),
795 };
796 
797 static u64 host_efer;
798 
799 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
800 
801 /*
802  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
803  * away by decrementing the array size.
804  */
805 static const u32 vmx_msr_index[] = {
806 #ifdef CONFIG_X86_64
807         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
808 #endif
809         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
810 };
811 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
812 
813 static inline bool is_page_fault(u32 intr_info)
814 {
815         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
816                              INTR_INFO_VALID_MASK)) ==
817                 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
818 }
819 
820 static inline bool is_no_device(u32 intr_info)
821 {
822         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
823                              INTR_INFO_VALID_MASK)) ==
824                 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
825 }
826 
827 static inline bool is_invalid_opcode(u32 intr_info)
828 {
829         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
830                              INTR_INFO_VALID_MASK)) ==
831                 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
832 }
833 
834 static inline bool is_external_interrupt(u32 intr_info)
835 {
836         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
837                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
838 }
839 
840 static inline bool is_machine_check(u32 intr_info)
841 {
842         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
843                              INTR_INFO_VALID_MASK)) ==
844                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
845 }
846 
847 static inline bool cpu_has_vmx_msr_bitmap(void)
848 {
849         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
850 }
851 
852 static inline bool cpu_has_vmx_tpr_shadow(void)
853 {
854         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
855 }
856 
857 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
858 {
859         return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
860 }
861 
862 static inline bool cpu_has_secondary_exec_ctrls(void)
863 {
864         return vmcs_config.cpu_based_exec_ctrl &
865                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
866 }
867 
868 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
869 {
870         return vmcs_config.cpu_based_2nd_exec_ctrl &
871                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
872 }
873 
874 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
875 {
876         return vmcs_config.cpu_based_2nd_exec_ctrl &
877                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
878 }
879 
880 static inline bool cpu_has_vmx_apic_register_virt(void)
881 {
882         return vmcs_config.cpu_based_2nd_exec_ctrl &
883                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
884 }
885 
886 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
887 {
888         return vmcs_config.cpu_based_2nd_exec_ctrl &
889                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
890 }
891 
892 static inline bool cpu_has_vmx_posted_intr(void)
893 {
894         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
895 }
896 
897 static inline bool cpu_has_vmx_apicv(void)
898 {
899         return cpu_has_vmx_apic_register_virt() &&
900                 cpu_has_vmx_virtual_intr_delivery() &&
901                 cpu_has_vmx_posted_intr();
902 }
903 
904 static inline bool cpu_has_vmx_flexpriority(void)
905 {
906         return cpu_has_vmx_tpr_shadow() &&
907                 cpu_has_vmx_virtualize_apic_accesses();
908 }
909 
910 static inline bool cpu_has_vmx_ept_execute_only(void)
911 {
912         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
913 }
914 
915 static inline bool cpu_has_vmx_eptp_uncacheable(void)
916 {
917         return vmx_capability.ept & VMX_EPTP_UC_BIT;
918 }
919 
920 static inline bool cpu_has_vmx_eptp_writeback(void)
921 {
922         return vmx_capability.ept & VMX_EPTP_WB_BIT;
923 }
924 
925 static inline bool cpu_has_vmx_ept_2m_page(void)
926 {
927         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
928 }
929 
930 static inline bool cpu_has_vmx_ept_1g_page(void)
931 {
932         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
933 }
934 
935 static inline bool cpu_has_vmx_ept_4levels(void)
936 {
937         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
938 }
939 
940 static inline bool cpu_has_vmx_ept_ad_bits(void)
941 {
942         return vmx_capability.ept & VMX_EPT_AD_BIT;
943 }
944 
945 static inline bool cpu_has_vmx_invept_context(void)
946 {
947         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
948 }
949 
950 static inline bool cpu_has_vmx_invept_global(void)
951 {
952         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
953 }
954 
955 static inline bool cpu_has_vmx_invvpid_single(void)
956 {
957         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
958 }
959 
960 static inline bool cpu_has_vmx_invvpid_global(void)
961 {
962         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
963 }
964 
965 static inline bool cpu_has_vmx_ept(void)
966 {
967         return vmcs_config.cpu_based_2nd_exec_ctrl &
968                 SECONDARY_EXEC_ENABLE_EPT;
969 }
970 
971 static inline bool cpu_has_vmx_unrestricted_guest(void)
972 {
973         return vmcs_config.cpu_based_2nd_exec_ctrl &
974                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
975 }
976 
977 static inline bool cpu_has_vmx_ple(void)
978 {
979         return vmcs_config.cpu_based_2nd_exec_ctrl &
980                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
981 }
982 
983 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
984 {
985         return flexpriority_enabled && irqchip_in_kernel(kvm);
986 }
987 
988 static inline bool cpu_has_vmx_vpid(void)
989 {
990         return vmcs_config.cpu_based_2nd_exec_ctrl &
991                 SECONDARY_EXEC_ENABLE_VPID;
992 }
993 
994 static inline bool cpu_has_vmx_rdtscp(void)
995 {
996         return vmcs_config.cpu_based_2nd_exec_ctrl &
997                 SECONDARY_EXEC_RDTSCP;
998 }
999 
1000 static inline bool cpu_has_vmx_invpcid(void)
1001 {
1002         return vmcs_config.cpu_based_2nd_exec_ctrl &
1003                 SECONDARY_EXEC_ENABLE_INVPCID;
1004 }
1005 
1006 static inline bool cpu_has_virtual_nmis(void)
1007 {
1008         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1009 }
1010 
1011 static inline bool cpu_has_vmx_wbinvd_exit(void)
1012 {
1013         return vmcs_config.cpu_based_2nd_exec_ctrl &
1014                 SECONDARY_EXEC_WBINVD_EXITING;
1015 }
1016 
1017 static inline bool cpu_has_vmx_shadow_vmcs(void)
1018 {
1019         u64 vmx_msr;
1020         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1021         /* check if the cpu supports writing r/o exit information fields */
1022         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1023                 return false;
1024 
1025         return vmcs_config.cpu_based_2nd_exec_ctrl &
1026                 SECONDARY_EXEC_SHADOW_VMCS;
1027 }
1028 
1029 static inline bool report_flexpriority(void)
1030 {
1031         return flexpriority_enabled;
1032 }
1033 
1034 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1035 {
1036         return vmcs12->cpu_based_vm_exec_control & bit;
1037 }
1038 
1039 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1040 {
1041         return (vmcs12->cpu_based_vm_exec_control &
1042                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1043                 (vmcs12->secondary_vm_exec_control & bit);
1044 }
1045 
1046 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1047 {
1048         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1049 }
1050 
1051 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1052 {
1053         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1054 }
1055 
1056 static inline bool is_exception(u32 intr_info)
1057 {
1058         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1059                 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1060 }
1061 
1062 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1063                               u32 exit_intr_info,
1064                               unsigned long exit_qualification);
1065 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1066                         struct vmcs12 *vmcs12,
1067                         u32 reason, unsigned long qualification);
1068 
1069 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1070 {
1071         int i;
1072 
1073         for (i = 0; i < vmx->nmsrs; ++i)
1074                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1075                         return i;
1076         return -1;
1077 }
1078 
1079 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1080 {
1081         struct {
1082                 u64 vpid : 16;
1083                 u64 rsvd : 48;
1084                 u64 gva;
1085         } operand = { vpid, 0, gva };
1086 
1087         asm volatile (__ex(ASM_VMX_INVVPID)
1088                       /* CF==1 or ZF==1 --> rc = -1 */
1089                       "; ja 1f ; ud2 ; 1:"
1090                       : : "a"(&operand), "c"(ext) : "cc", "memory");
1091 }
1092 
1093 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1094 {
1095         struct {
1096                 u64 eptp, gpa;
1097         } operand = {eptp, gpa};
1098 
1099         asm volatile (__ex(ASM_VMX_INVEPT)
1100                         /* CF==1 or ZF==1 --> rc = -1 */
1101                         "; ja 1f ; ud2 ; 1:\n"
1102                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1103 }
1104 
1105 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1106 {
1107         int i;
1108 
1109         i = __find_msr_index(vmx, msr);
1110         if (i >= 0)
1111                 return &vmx->guest_msrs[i];
1112         return NULL;
1113 }
1114 
1115 static void vmcs_clear(struct vmcs *vmcs)
1116 {
1117         u64 phys_addr = __pa(vmcs);
1118         u8 error;
1119 
1120         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1121                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1122                       : "cc", "memory");
1123         if (error)
1124                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1125                        vmcs, phys_addr);
1126 }
1127 
1128 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1129 {
1130         vmcs_clear(loaded_vmcs->vmcs);
1131         loaded_vmcs->cpu = -1;
1132         loaded_vmcs->launched = 0;
1133 }
1134 
1135 static void vmcs_load(struct vmcs *vmcs)
1136 {
1137         u64 phys_addr = __pa(vmcs);
1138         u8 error;
1139 
1140         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1141                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1142                         : "cc", "memory");
1143         if (error)
1144                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1145                        vmcs, phys_addr);
1146 }
1147 
1148 #ifdef CONFIG_KEXEC
1149 /*
1150  * This bitmap is used to indicate whether the vmclear
1151  * operation is enabled on each cpu. All cpus are disabled by
1152  * default.
1153  */
1154 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1155 
1156 static inline void crash_enable_local_vmclear(int cpu)
1157 {
1158         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1159 }
1160 
1161 static inline void crash_disable_local_vmclear(int cpu)
1162 {
1163         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1164 }
1165 
1166 static inline int crash_local_vmclear_enabled(int cpu)
1167 {
1168         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1169 }
1170 
1171 static void crash_vmclear_local_loaded_vmcss(void)
1172 {
1173         int cpu = raw_smp_processor_id();
1174         struct loaded_vmcs *v;
1175 
1176         if (!crash_local_vmclear_enabled(cpu))
1177                 return;
1178 
1179         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1180                             loaded_vmcss_on_cpu_link)
1181                 vmcs_clear(v->vmcs);
1182 }
1183 #else
1184 static inline void crash_enable_local_vmclear(int cpu) { }
1185 static inline void crash_disable_local_vmclear(int cpu) { }
1186 #endif /* CONFIG_KEXEC */
1187 
1188 static void __loaded_vmcs_clear(void *arg)
1189 {
1190         struct loaded_vmcs *loaded_vmcs = arg;
1191         int cpu = raw_smp_processor_id();
1192 
1193         if (loaded_vmcs->cpu != cpu)
1194                 return; /* vcpu migration can race with cpu offline */
1195         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1196                 per_cpu(current_vmcs, cpu) = NULL;
1197         crash_disable_local_vmclear(cpu);
1198         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1199 
1200         /*
1201          * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
1202          * happens before setting loaded_vmcs->cpu to -1, which is done in
1203          * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
1204          * and then add the vmcs to the percpu list before it is deleted.
1205          */
1206         smp_wmb();
1207 
1208         loaded_vmcs_init(loaded_vmcs);
1209         crash_enable_local_vmclear(cpu);
1210 }
1211 
1212 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1213 {
1214         int cpu = loaded_vmcs->cpu;
1215 
1216         if (cpu != -1)
1217                 smp_call_function_single(cpu,
1218                          __loaded_vmcs_clear, loaded_vmcs, 1);
1219 }
1220 
1221 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1222 {
1223         if (vmx->vpid == 0)
1224                 return;
1225 
1226         if (cpu_has_vmx_invvpid_single())
1227                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1228 }
1229 
1230 static inline void vpid_sync_vcpu_global(void)
1231 {
1232         if (cpu_has_vmx_invvpid_global())
1233                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1234 }
1235 
1236 static inline void vpid_sync_context(struct vcpu_vmx *vmx)
1237 {
1238         if (cpu_has_vmx_invvpid_single())
1239                 vpid_sync_vcpu_single(vmx);
1240         else
1241                 vpid_sync_vcpu_global();
1242 }
1243 
1244 static inline void ept_sync_global(void)
1245 {
1246         if (cpu_has_vmx_invept_global())
1247                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1248 }
1249 
1250 static inline void ept_sync_context(u64 eptp)
1251 {
1252         if (enable_ept) {
1253                 if (cpu_has_vmx_invept_context())
1254                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1255                 else
1256                         ept_sync_global();
1257         }
1258 }
1259 
1260 static __always_inline unsigned long vmcs_readl(unsigned long field)
1261 {
1262         unsigned long value;
1263 
1264         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1265                       : "=a"(value) : "d"(field) : "cc");
1266         return value;
1267 }
1268 
1269 static __always_inline u16 vmcs_read16(unsigned long field)
1270 {
1271         return vmcs_readl(field);
1272 }
1273 
1274 static __always_inline u32 vmcs_read32(unsigned long field)
1275 {
1276         return vmcs_readl(field);
1277 }
1278 
1279 static __always_inline u64 vmcs_read64(unsigned long field)
1280 {
1281 #ifdef CONFIG_X86_64
1282         return vmcs_readl(field);
1283 #else
1284         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1285 #endif
1286 }
1287 
1288 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1289 {
1290         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1291                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1292         dump_stack();
1293 }
1294 
1295 static void vmcs_writel(unsigned long field, unsigned long value)
1296 {
1297         u8 error;
1298 
1299         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1300                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1301         if (unlikely(error))
1302                 vmwrite_error(field, value);
1303 }
1304 
1305 static void vmcs_write16(unsigned long field, u16 value)
1306 {
1307         vmcs_writel(field, value);
1308 }
1309 
1310 static void vmcs_write32(unsigned long field, u32 value)
1311 {
1312         vmcs_writel(field, value);
1313 }
1314 
1315 static void vmcs_write64(unsigned long field, u64 value)
1316 {
1317         vmcs_writel(field, value);
1318 #ifndef CONFIG_X86_64
1319         asm volatile ("");
1320         vmcs_writel(field+1, value >> 32);
1321 #endif
1322 }
1323 
1324 static void vmcs_clear_bits(unsigned long field, u32 mask)
1325 {
1326         vmcs_writel(field, vmcs_readl(field) & ~mask);
1327 }
1328 
1329 static void vmcs_set_bits(unsigned long field, u32 mask)
1330 {
1331         vmcs_writel(field, vmcs_readl(field) | mask);
1332 }
1333 
1334 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1335 {
1336         vmcs_write32(VM_ENTRY_CONTROLS, val);
1337         vmx->vm_entry_controls_shadow = val;
1338 }
1339 
1340 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1341 {
1342         if (vmx->vm_entry_controls_shadow != val)
1343                 vm_entry_controls_init(vmx, val);
1344 }
1345 
1346 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1347 {
1348         return vmx->vm_entry_controls_shadow;
1349 }
1350 
1351 
1352 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1353 {
1354         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1355 }
1356 
1357 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1358 {
1359         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1360 }
1361 
1362 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1363 {
1364         vmcs_write32(VM_EXIT_CONTROLS, val);
1365         vmx->vm_exit_controls_shadow = val;
1366 }
1367 
1368 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1369 {
1370         if (vmx->vm_exit_controls_shadow != val)
1371                 vm_exit_controls_init(vmx, val);
1372 }
1373 
1374 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1375 {
1376         return vmx->vm_exit_controls_shadow;
1377 }
1378 
1379 
1380 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1381 {
1382         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1383 }
1384 
1385 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1386 {
1387         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1388 }
1389 
1390 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1391 {
1392         vmx->segment_cache.bitmask = 0;
1393 }
1394 
1395 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1396                                        unsigned field)
1397 {
1398         bool ret;
1399         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1400 
1401         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1402                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1403                 vmx->segment_cache.bitmask = 0;
1404         }
1405         ret = vmx->segment_cache.bitmask & mask;
1406         vmx->segment_cache.bitmask |= mask;
1407         return ret;
1408 }
1409 
1410 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1411 {
1412         u16 *p = &vmx->segment_cache.seg[seg].selector;
1413 
1414         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1415                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1416         return *p;
1417 }
1418 
1419 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1420 {
1421         ulong *p = &vmx->segment_cache.seg[seg].base;
1422 
1423         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1424                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1425         return *p;
1426 }
1427 
1428 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1429 {
1430         u32 *p = &vmx->segment_cache.seg[seg].limit;
1431 
1432         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1433                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1434         return *p;
1435 }
1436 
1437 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1438 {
1439         u32 *p = &vmx->segment_cache.seg[seg].ar;
1440 
1441         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1442                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1443         return *p;
1444 }
1445 
1446 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1447 {
1448         u32 eb;
1449 
1450         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1451              (1u << NM_VECTOR) | (1u << DB_VECTOR);
1452         if ((vcpu->guest_debug &
1453              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1454             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1455                 eb |= 1u << BP_VECTOR;
1456         if (to_vmx(vcpu)->rmode.vm86_active)
1457                 eb = ~0;
1458         if (enable_ept)
1459                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1460         if (vcpu->fpu_active)
1461                 eb &= ~(1u << NM_VECTOR);
1462 
1463         /* When we are running a nested L2 guest and L1 specified for it a
1464          * certain exception bitmap, we must trap the same exceptions and pass
1465          * them to L1. When running L2, we will only handle the exceptions
1466          * specified above if L1 did not want them.
1467          */
1468         if (is_guest_mode(vcpu))
1469                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1470 
1471         vmcs_write32(EXCEPTION_BITMAP, eb);
1472 }
1473 
1474 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1475                 unsigned long entry, unsigned long exit)
1476 {
1477         vm_entry_controls_clearbit(vmx, entry);
1478         vm_exit_controls_clearbit(vmx, exit);
1479 }
1480 
1481 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1482 {
1483         unsigned i;
1484         struct msr_autoload *m = &vmx->msr_autoload;
1485 
1486         switch (msr) {
1487         case MSR_EFER:
1488                 if (cpu_has_load_ia32_efer) {
1489                         clear_atomic_switch_msr_special(vmx,
1490                                         VM_ENTRY_LOAD_IA32_EFER,
1491                                         VM_EXIT_LOAD_IA32_EFER);
1492                         return;
1493                 }
1494                 break;
1495         case MSR_CORE_PERF_GLOBAL_CTRL:
1496                 if (cpu_has_load_perf_global_ctrl) {
1497                         clear_atomic_switch_msr_special(vmx,
1498                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1499                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1500                         return;
1501                 }
1502                 break;
1503         }
1504 
1505         for (i = 0; i < m->nr; ++i)
1506                 if (m->guest[i].index == msr)
1507                         break;
1508 
1509         if (i == m->nr)
1510                 return;
1511         --m->nr;
1512         m->guest[i] = m->guest[m->nr];
1513         m->host[i] = m->host[m->nr];
1514         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1515         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1516 }
1517 
1518 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1519                 unsigned long entry, unsigned long exit,
1520                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1521                 u64 guest_val, u64 host_val)
1522 {
1523         vmcs_write64(guest_val_vmcs, guest_val);
1524         vmcs_write64(host_val_vmcs, host_val);
1525         vm_entry_controls_setbit(vmx, entry);
1526         vm_exit_controls_setbit(vmx, exit);
1527 }
1528 
1529 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1530                                   u64 guest_val, u64 host_val)
1531 {
1532         unsigned i;
1533         struct msr_autoload *m = &vmx->msr_autoload;
1534 
1535         switch (msr) {
1536         case MSR_EFER:
1537                 if (cpu_has_load_ia32_efer) {
1538                         add_atomic_switch_msr_special(vmx,
1539                                         VM_ENTRY_LOAD_IA32_EFER,
1540                                         VM_EXIT_LOAD_IA32_EFER,
1541                                         GUEST_IA32_EFER,
1542                                         HOST_IA32_EFER,
1543                                         guest_val, host_val);
1544                         return;
1545                 }
1546                 break;
1547         case MSR_CORE_PERF_GLOBAL_CTRL:
1548                 if (cpu_has_load_perf_global_ctrl) {
1549                         add_atomic_switch_msr_special(vmx,
1550                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1551                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1552                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1553                                         HOST_IA32_PERF_GLOBAL_CTRL,
1554                                         guest_val, host_val);
1555                         return;
1556                 }
1557                 break;
1558         case MSR_IA32_PEBS_ENABLE:
1559                 /* PEBS needs a quiescent period after being disabled (to write
1560                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1561                  * provide that period, so a CPU could write host's record into
1562                  * guest's memory.
1563                  */
1564                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1565         }
1566 
1567         for (i = 0; i < m->nr; ++i)
1568                 if (m->guest[i].index == msr)
1569                         break;
1570 
1571         if (i == NR_AUTOLOAD_MSRS) {
1572                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1573                                 "Can't add msr %x\n", msr);
1574                 return;
1575         } else if (i == m->nr) {
1576                 ++m->nr;
1577                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1578                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1579         }
1580 
1581         m->guest[i].index = msr;
1582         m->guest[i].value = guest_val;
1583         m->host[i].index = msr;
1584         m->host[i].value = host_val;
1585 }
1586 
1587 static void reload_tss(void)
1588 {
1589         /*
1590          * VT restores TR but not its size (limit), so reload the host TSS here.
1591          */
1592         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1593         struct desc_struct *descs;
1594 
1595         descs = (void *)gdt->address;
1596         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1597         load_TR_desc();
1598 }
1599 
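/*
 * Decide how guest EFER is switched: normally it can go through the
 * shared-MSR machinery (return true), but with EPT a guest/host NX
 * mismatch cannot be hidden, so EFER is put on the atomic autoload
 * list instead and the shared-MSR entry is skipped (return false).
 */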
1600 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1601 {
1602         u64 guest_efer;
1603         u64 ignore_bits;
1604 
1605         guest_efer = vmx->vcpu.arch.efer;
1606 
1607         /*
1608          * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1609          * outside long mode
1610          */
1611         ignore_bits = EFER_NX | EFER_SCE;
1612 #ifdef CONFIG_X86_64
1613         ignore_bits |= EFER_LMA | EFER_LME;
1614         /* SCE is meaningful only in long mode on Intel */
1615         if (guest_efer & EFER_LMA)
1616                 ignore_bits &= ~(u64)EFER_SCE;
1617 #endif
1618         guest_efer &= ~ignore_bits;
1619         guest_efer |= host_efer & ignore_bits;
1620         vmx->guest_msrs[efer_offset].data = guest_efer;
1621         vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1622 
1623         clear_atomic_switch_msr(vmx, MSR_EFER);
1624         /* On ept, can't emulate nx, and must switch nx atomically */
1625         if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
1626                 guest_efer = vmx->vcpu.arch.efer;
1627                 if (!(guest_efer & EFER_LMA))
1628                         guest_efer &= ~EFER_LME;
1629                 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
1630                 return false;
1631         }
1632 
1633         return true;
1634 }
1635 
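/*
 * Compute the linear base address of the segment named by 'selector' by
 * walking the GDT (or, for TI=1 selectors, the LDT).  On 64-bit hosts,
 * system descriptors such as LDT/TSS carry an extra base3 word that is
 * folded into bits 63:32.
 */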
1636 static unsigned long segment_base(u16 selector)
1637 {
1638         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1639         struct desc_struct *d;
1640         unsigned long table_base;
1641         unsigned long v;
1642 
1643         if (!(selector & ~3))
1644                 return 0;
1645 
1646         table_base = gdt->address;
1647 
1648         if (selector & 4) {           /* from ldt */
1649                 u16 ldt_selector = kvm_read_ldt();
1650 
1651                 if (!(ldt_selector & ~3))
1652                         return 0;
1653 
1654                 table_base = segment_base(ldt_selector);
1655         }
1656         d = (struct desc_struct *)(table_base + (selector & ~7));
1657         v = get_desc_base(d);
1658 #ifdef CONFIG_X86_64
1659         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1660                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1661 #endif
1662         return v;
1663 }
1664 
1665 static inline unsigned long kvm_read_tr_base(void)
1666 {
1667         u16 tr;
1668         asm("str %0" : "=g"(tr));
1669         return segment_base(tr);
1670 }
1671 
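/*
 * Snapshot host segment selectors, FS/GS bases and the shared guest MSRs
 * before entering the guest; __vmx_load_host_state() below undoes this
 * on the way back to the host.
 */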
1672 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1673 {
1674         struct vcpu_vmx *vmx = to_vmx(vcpu);
1675         int i;
1676 
1677         if (vmx->host_state.loaded)
1678                 return;
1679 
1680         vmx->host_state.loaded = 1;
1681         /*
1682          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1683          * allow segment selectors with cpl > 0 or ti == 1.
1684          */
1685         vmx->host_state.ldt_sel = kvm_read_ldt();
1686         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1687         savesegment(fs, vmx->host_state.fs_sel);
1688         if (!(vmx->host_state.fs_sel & 7)) {
1689                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1690                 vmx->host_state.fs_reload_needed = 0;
1691         } else {
1692                 vmcs_write16(HOST_FS_SELECTOR, 0);
1693                 vmx->host_state.fs_reload_needed = 1;
1694         }
1695         savesegment(gs, vmx->host_state.gs_sel);
1696         if (!(vmx->host_state.gs_sel & 7))
1697                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1698         else {
1699                 vmcs_write16(HOST_GS_SELECTOR, 0);
1700                 vmx->host_state.gs_ldt_reload_needed = 1;
1701         }
1702 
1703 #ifdef CONFIG_X86_64
1704         savesegment(ds, vmx->host_state.ds_sel);
1705         savesegment(es, vmx->host_state.es_sel);
1706 #endif
1707 
1708 #ifdef CONFIG_X86_64
1709         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1710         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1711 #else
1712         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1713         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1714 #endif
1715 
1716 #ifdef CONFIG_X86_64
1717         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1718         if (is_long_mode(&vmx->vcpu))
1719                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1720 #endif
1721         for (i = 0; i < vmx->save_nmsrs; ++i)
1722                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1723                                    vmx->guest_msrs[i].data,
1724                                    vmx->guest_msrs[i].mask);
1725 }
1726 
1727 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1728 {
1729         if (!vmx->host_state.loaded)
1730                 return;
1731 
1732         ++vmx->vcpu.stat.host_state_reload;
1733         vmx->host_state.loaded = 0;
1734 #ifdef CONFIG_X86_64
1735         if (is_long_mode(&vmx->vcpu))
1736                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1737 #endif
1738         if (vmx->host_state.gs_ldt_reload_needed) {
1739                 kvm_load_ldt(vmx->host_state.ldt_sel);
1740 #ifdef CONFIG_X86_64
1741                 load_gs_index(vmx->host_state.gs_sel);
1742 #else
1743                 loadsegment(gs, vmx->host_state.gs_sel);
1744 #endif
1745         }
1746         if (vmx->host_state.fs_reload_needed)
1747                 loadsegment(fs, vmx->host_state.fs_sel);
1748 #ifdef CONFIG_X86_64
1749         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1750                 loadsegment(ds, vmx->host_state.ds_sel);
1751                 loadsegment(es, vmx->host_state.es_sel);
1752         }
1753 #endif
1754         reload_tss();
1755 #ifdef CONFIG_X86_64
1756         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1757 #endif
1758         /*
1759          * If the FPU is not active (through the host task or
1760          * the guest vcpu), then restore the cr0.TS bit.
1761          */
1762         if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
1763                 stts();
1764         load_gdt(&__get_cpu_var(host_gdt));
1765 }
1766 
1767 static void vmx_load_host_state(struct vcpu_vmx *vmx)
1768 {
1769         preempt_disable();
1770         __vmx_load_host_state(vmx);
1771         preempt_enable();
1772 }
1773 
1774 /*
1775  * Switches to the specified vcpu, until a matching vcpu_put(), but assumes
1776  * vcpu mutex is already taken.
1777  */
1778 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1779 {
1780         struct vcpu_vmx *vmx = to_vmx(vcpu);
1781         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1782 
1783         if (!vmm_exclusive)
1784                 kvm_cpu_vmxon(phys_addr);
1785         else if (vmx->loaded_vmcs->cpu != cpu)
1786                 loaded_vmcs_clear(vmx->loaded_vmcs);
1787 
1788         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1789                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1790                 vmcs_load(vmx->loaded_vmcs->vmcs);
1791         }
1792 
1793         if (vmx->loaded_vmcs->cpu != cpu) {
1794                 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1795                 unsigned long sysenter_esp;
1796 
1797                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1798                 local_irq_disable();
1799                 crash_disable_local_vmclear(cpu);
1800 
1801                 /*
1802                  * The read of loaded_vmcs->cpu must happen before fetching
1803                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
1804                  * See the comments in __loaded_vmcs_clear().
1805                  */
1806                 smp_rmb();
1807 
1808                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1809                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1810                 crash_enable_local_vmclear(cpu);
1811                 local_irq_enable();
1812 
1813                 /*
1814                  * Linux uses per-cpu TSS and GDT, so set these when switching
1815                  * processors.
1816                  */
1817                 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1818                 vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
1819 
1820                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1821                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1822                 vmx->loaded_vmcs->cpu = cpu;
1823         }
1824 }
1825 
1826 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1827 {
1828         __vmx_load_host_state(to_vmx(vcpu));
1829         if (!vmm_exclusive) {
1830                 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1831                 vcpu->cpu = -1;
1832                 kvm_cpu_vmxoff();
1833         }
1834 }
1835 
1836 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1837 {
1838         ulong cr0;
1839 
1840         if (vcpu->fpu_active)
1841                 return;
1842         vcpu->fpu_active = 1;
1843         cr0 = vmcs_readl(GUEST_CR0);
1844         cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1845         cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1846         vmcs_writel(GUEST_CR0, cr0);
1847         update_exception_bitmap(vcpu);
1848         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1849         if (is_guest_mode(vcpu))
1850                 vcpu->arch.cr0_guest_owned_bits &=
1851                         ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1852         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1853 }
1854 
1855 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1856 
1857 /*
1858  * Return the cr0 value that a nested guest would read. This is a combination
1859  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1860  * its hypervisor (cr0_read_shadow).
1861  */
1862 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1863 {
1864         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1865                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1866 }
1867 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1868 {
1869         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1870                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1871 }
1872 
1873 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1874 {
1875         /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1876          * set this *before* calling this function.
1877          */
1878         vmx_decache_cr0_guest_bits(vcpu);
1879         vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1880         update_exception_bitmap(vcpu);
1881         vcpu->arch.cr0_guest_owned_bits = 0;
1882         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1883         if (is_guest_mode(vcpu)) {
1884                 /*
1885                  * L1's specified read shadow might not contain the TS bit,
1886                  * so now that we turned on shadowing of this bit, we need to
1887                  * set this bit of the shadow. Like in nested_vmx_run we need
1888                  * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1889                  * up-to-date here because we just decached cr0.TS (and we'll
1890                  * only update vmcs12->guest_cr0 on nested exit).
1891                  */
1892                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1893                 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1894                         (vcpu->arch.cr0 & X86_CR0_TS);
1895                 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1896         } else
1897                 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1898 }
1899 
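/*
 * RFLAGS accessors: while emulating real mode (vm86_active) the hardware
 * runs the guest with IOPL and VM forced on, so the guest-visible flag
 * values are kept in rmode.save_rflags and merged back in here.
 */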
1900 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1901 {
1902         unsigned long rflags, save_rflags;
1903 
1904         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1905                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1906                 rflags = vmcs_readl(GUEST_RFLAGS);
1907                 if (to_vmx(vcpu)->rmode.vm86_active) {
1908                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1909                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1910                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1911                 }
1912                 to_vmx(vcpu)->rflags = rflags;
1913         }
1914         return to_vmx(vcpu)->rflags;
1915 }
1916 
1917 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1918 {
1919         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1920         to_vmx(vcpu)->rflags = rflags;
1921         if (to_vmx(vcpu)->rmode.vm86_active) {
1922                 to_vmx(vcpu)->rmode.save_rflags = rflags;
1923                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1924         }
1925         vmcs_writel(GUEST_RFLAGS, rflags);
1926 }
1927 
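/*
 * Translate between the VMCS guest interruptibility state (STI / MOV SS
 * blocking bits) and KVM's generic interrupt-shadow flags.
 */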
1928 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1929 {
1930         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1931         int ret = 0;
1932 
1933         if (interruptibility & GUEST_INTR_STATE_STI)
1934                 ret |= KVM_X86_SHADOW_INT_STI;
1935         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1936                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1937 
1938         return ret & mask;
1939 }
1940 
1941 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1942 {
1943         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1944         u32 interruptibility = interruptibility_old;
1945 
1946         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1947 
1948         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1949                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1950         else if (mask & KVM_X86_SHADOW_INT_STI)
1951                 interruptibility |= GUEST_INTR_STATE_STI;
1952 
1953         if (interruptibility != interruptibility_old)
1954                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1955 }
1956 
1957 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1958 {
1959         unsigned long rip;
1960 
1961         rip = kvm_rip_read(vcpu);
1962         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1963         kvm_rip_write(vcpu, rip);
1964 
1965         /* skipping an emulated instruction also counts */
1966         vmx_set_interrupt_shadow(vcpu, 0);
1967 }
1968 
1969 /*
1970  * KVM wants to inject page-faults which it got to the guest. This function
1971  * checks whether in a nested guest, we need to inject them to L1 or L2.
1972  */
1973 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
1974 {
1975         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1976 
1977         if (!(vmcs12->exception_bitmap & (1u << nr)))
1978                 return 0;
1979 
1980         nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
1981                           vmcs_read32(VM_EXIT_INTR_INFO),
1982                           vmcs_readl(EXIT_QUALIFICATION));
1983         return 1;
1984 }
1985 
1986 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1987                                 bool has_error_code, u32 error_code,
1988                                 bool reinject)
1989 {
1990         struct vcpu_vmx *vmx = to_vmx(vcpu);
1991         u32 intr_info = nr | INTR_INFO_VALID_MASK;
1992 
1993         if (!reinject && is_guest_mode(vcpu) &&
1994             nested_vmx_check_exception(vcpu, nr))
1995                 return;
1996 
1997         if (has_error_code) {
1998                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1999                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2000         }
2001 
2002         if (vmx->rmode.vm86_active) {
2003                 int inc_eip = 0;
2004                 if (kvm_exception_is_soft(nr))
2005                         inc_eip = vcpu->arch.event_exit_inst_len;
2006                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2007                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2008                 return;
2009         }
2010 
2011         if (kvm_exception_is_soft(nr)) {
2012                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2013                              vmx->vcpu.arch.event_exit_inst_len);
2014                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2015         } else
2016                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2017 
2018         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2019 }
2020 
2021 static bool vmx_rdtscp_supported(void)
2022 {
2023         return cpu_has_vmx_rdtscp();
2024 }
2025 
2026 static bool vmx_invpcid_supported(void)
2027 {
2028         return cpu_has_vmx_invpcid() && enable_ept;
2029 }
2030 
2031 /*
2032  * Swap MSR entry in host/guest MSR entry array.
2033  */
2034 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2035 {
2036         struct shared_msr_entry tmp;
2037 
2038         tmp = vmx->guest_msrs[to];
2039         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2040         vmx->guest_msrs[from] = tmp;
2041 }
2042 
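/*
 * Select which of the pre-built MSR bitmaps the VMCS should use, based on
 * whether the in-kernel APIC is in x2APIC mode and whether the guest is
 * in long mode.
 */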
2043 static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2044 {
2045         unsigned long *msr_bitmap;
2046 
2047         if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
2048                 if (is_long_mode(vcpu))
2049                         msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2050                 else
2051                         msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2052         } else {
2053                 if (is_long_mode(vcpu))
2054                         msr_bitmap = vmx_msr_bitmap_longmode;
2055                 else
2056                         msr_bitmap = vmx_msr_bitmap_legacy;
2057         }
2058 
2059         vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2060 }
2061 
2062 /*
2063  * Set up the vmcs to automatically save and restore system
2064  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2065  * mode, as fiddling with msrs is very expensive.
2066  */
2067 static void setup_msrs(struct vcpu_vmx *vmx)
2068 {
2069         int save_nmsrs, index;
2070 
2071         save_nmsrs = 0;
2072 #ifdef CONFIG_X86_64
2073         if (is_long_mode(&vmx->vcpu)) {
2074                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2075                 if (index >= 0)
2076                         move_msr_up(vmx, index, save_nmsrs++);
2077                 index = __find_msr_index(vmx, MSR_LSTAR);
2078                 if (index >= 0)
2079                         move_msr_up(vmx, index, save_nmsrs++);
2080                 index = __find_msr_index(vmx, MSR_CSTAR);
2081                 if (index >= 0)
2082                         move_msr_up(vmx, index, save_nmsrs++);
2083                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2084                 if (index >= 0 && vmx->rdtscp_enabled)
2085                         move_msr_up(vmx, index, save_nmsrs++);
2086                 /*
2087                  * MSR_STAR is only needed on long mode guests, and only
2088                  * if efer.sce is enabled.
2089                  */
2090                 index = __find_msr_index(vmx, MSR_STAR);
2091                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2092                         move_msr_up(vmx, index, save_nmsrs++);
2093         }
2094 #endif
2095         index = __find_msr_index(vmx, MSR_EFER);
2096         if (index >= 0 && update_transition_efer(vmx, index))
2097                 move_msr_up(vmx, index, save_nmsrs++);
2098 
2099         vmx->save_nmsrs = save_nmsrs;
2100 
2101         if (cpu_has_vmx_msr_bitmap())
2102                 vmx_set_msr_bitmap(&vmx->vcpu);
2103 }
2104 
2105 /*
2106  * reads and returns guest's timestamp counter "register"
2107  * guest_tsc = host_tsc + tsc_offset    -- 21.3
2108  */
2109 static u64 guest_read_tsc(void)
2110 {
2111         u64 host_tsc, tsc_offset;
2112 
2113         rdtscll(host_tsc);
2114         tsc_offset = vmcs_read64(TSC_OFFSET);
2115         return host_tsc + tsc_offset;
2116 }
2117 
2118 /*
2119  * Like guest_read_tsc, but always returns L1's notion of the timestamp
2120  * counter, even if a nested guest (L2) is currently running.
2121  */
2122 u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2123 {
2124         u64 tsc_offset;
2125 
2126         tsc_offset = is_guest_mode(vcpu) ?
2127                 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
2128                 vmcs_read64(TSC_OFFSET);
2129         return host_tsc + tsc_offset;
2130 }
2131 
2132 /*
2133  * Engage any workarounds for mis-matched TSC rates.  Currently limited to
2134  * software catchup for faster rates on slower CPUs.
2135  */
2136 static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2137 {
2138         if (!scale)
2139                 return;
2140 
2141         if (user_tsc_khz > tsc_khz) {
2142                 vcpu->arch.tsc_catchup = 1;
2143                 vcpu->arch.tsc_always_catchup = 1;
2144         } else
2145                 WARN(1, "user requested TSC rate below hardware speed\n");
2146 }
2147 
2148 static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
2149 {
2150         return vmcs_read64(TSC_OFFSET);
2151 }
2152 
2153 /*
2154  * writes 'offset' into guest's timestamp counter offset register
2155  */
2156 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2157 {
2158         if (is_guest_mode(vcpu)) {
2159                 /*
2160                  * We're here if L1 chose not to trap WRMSR to TSC. According
2161                  * to the spec, this should set L1's TSC; the offset that L1
2162                  * set for L2 remains unchanged, and still needs to be added
2163                  * to the newly set TSC to get L2's TSC.
2164                  */
2165                 struct vmcs12 *vmcs12;
2166                 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
2167                 /* recalculate vmcs02.TSC_OFFSET: */
2168                 vmcs12 = get_vmcs12(vcpu);
2169                 vmcs_write64(TSC_OFFSET, offset +
2170                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2171                          vmcs12->tsc_offset : 0));
2172         } else {
2173                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2174                                            vmcs_read64(TSC_OFFSET), offset);
2175                 vmcs_write64(TSC_OFFSET, offset);
2176         }
2177 }
2178 
2179 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2180 {
2181         u64 offset = vmcs_read64(TSC_OFFSET);
2182 
2183         vmcs_write64(TSC_OFFSET, offset + adjustment);
2184         if (is_guest_mode(vcpu)) {
2185                 /* Even when running L2, the adjustment needs to apply to L1 */
2186                 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2187         } else
2188                 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2189                                            offset + adjustment);
2190 }
2191 
2192 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2193 {
2194         return target_tsc - native_read_tsc();
2195 }
2196 
2197 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2198 {
2199         struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2200         return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2201 }
2202 
2203 /*
2204  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2205  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2206  * all guests if the "nested" module option is off, and can also be disabled
2207  * for a single guest by disabling its VMX cpuid bit.
2208  */
2209 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2210 {
2211         return nested && guest_cpuid_has_vmx(vcpu);
2212 }
2213 
2214 /*
2215  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2216  * returned for the various VMX controls MSRs when nested VMX is enabled.
2217  * The same values should also be used to verify that vmcs12 control fields are
2218  * valid during nested entry from L1 to L2.
2219  * Each of these control msrs has a low and high 32-bit half: A low bit is on
2220  * if the corresponding bit in the (32-bit) control field *must* be on, and a
2221  * bit in the high half is on if the corresponding bit in the control field
2222  * may be on. See also vmx_control_verify().
2223  * TODO: allow these variables to be modified (downgraded) by module options
2224  * or other means.
2225  */
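/*
 * Illustrative example (hypothetical values, not read from hardware): if
 * nested_vmx_pinbased_ctls_low were 0x16 and ..._high were 0x7f, then L1
 * would have to keep bits 1, 2 and 4 set in vmcs12's pin-based controls
 * and could set nothing outside bits 0-6; vmx_control_verify() below
 * checks exactly that relationship.
 */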
2226 static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
2227 static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2228 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2229 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2230 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2231 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2232 static u32 nested_vmx_ept_caps;
2233 static __init void nested_vmx_setup_ctls_msrs(void)
2234 {
2235         /*
2236          * Note that as a general rule, the high half of the MSRs (bits in
2237          * the control fields which may be 1) should be initialized by the
2238          * intersection of the underlying hardware's MSR (i.e., features which
2239          * can be supported) and the list of features we want to expose -
2240          * because they are known to be properly supported in our code.
2241          * Also, usually, the low half of the MSRs (bits which must be 1) can
2242          * be set to 0, meaning that L1 may turn off any of these bits. The
2243          * reason is that if one of these bits is necessary, it will already
2244          * be set in vmcs01, and prepare_vmcs02, which bitwise-or's the control
2245          * fields of vmcs01 and vmcs12, will keep it set in vmcs02 even if L1
2246          * cleared it - and nested_vmx_exit_handled() will not pass the
2247          * related exits to L1. These rules have exceptions below.
2248          */
2249 
2250         /* pin-based controls */
2251         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2252               nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
2253         /*
2254          * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
2255          * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
2256          */
2257         nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2258         nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
2259                 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
2260                 PIN_BASED_VMX_PREEMPTION_TIMER;
2261         nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2262 
2263         /*
2264          * Exit controls
2265          * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2266          * 17 must be 1.
2267          */
2268         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2269                 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2270         nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2271         /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2272         nested_vmx_exit_ctls_high &=
2273 #ifdef CONFIG_X86_64
2274                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2275 #endif
2276                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
2277                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
2278         if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
2279             !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
2280                 nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
2281                 nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2282         }
2283         nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2284                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
2285 
2286         /* entry controls */
2287         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2288                 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
2289         /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2290         nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2291         nested_vmx_entry_ctls_high &=
2292 #ifdef CONFIG_X86_64
2293                 VM_ENTRY_IA32E_MODE |
2294 #endif
2295                 VM_ENTRY_LOAD_IA32_PAT;
2296         nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2297                                        VM_ENTRY_LOAD_IA32_EFER);
2298 
2299         /* cpu-based controls */
2300         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2301                 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
2302         nested_vmx_procbased_ctls_low = 0;
2303         nested_vmx_procbased_ctls_high &=
2304                 CPU_BASED_VIRTUAL_INTR_PENDING |
2305                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2306                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2307                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2308                 CPU_BASED_CR3_STORE_EXITING |
2309 #ifdef CONFIG_X86_64
2310                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2311 #endif
2312                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2313                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2314                 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2315                 CPU_BASED_PAUSE_EXITING |
2316                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2317         /*
2318          * We can allow some features even when not supported by the
2319          * hardware. For example, L1 can specify an MSR bitmap - and we
2320          * can use it to avoid exits to L1 - even when L0 runs L2
2321          * without MSR bitmaps.
2322          */
2323         nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
2324 
2325         /* secondary cpu-based controls */
2326         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2327                 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
2328         nested_vmx_secondary_ctls_low = 0;
2329         nested_vmx_secondary_ctls_high &=
2330                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2331                 SECONDARY_EXEC_WBINVD_EXITING;
2332 
2333         if (enable_ept) {
2334                 /* nested EPT: emulate EPT also to L1 */
2335                 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
2336                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
2337                 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2338                          VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2339                          VMX_EPT_INVEPT_BIT;
2340                 nested_vmx_ept_caps &= vmx_capability.ept;
2341                 /*
2342                  * Since invept is completely emulated, we support both global
2343                  * and context invalidation independent of what the host cpu
2344                  * supports.
2345                  */
2346                 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2347                         VMX_EPT_EXTENT_CONTEXT_BIT;
2348         } else
2349                 nested_vmx_ept_caps = 0;
2350 
2351         /* miscellaneous data */
2352         rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2353         nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
2354                 VMX_MISC_SAVE_EFER_LMA;
2355         nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
2356         nested_vmx_misc_high = 0;
2357 }
2358 
2359 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2360 {
2361         /*
2362          * Bits that are 0 in high must be 0, and bits that are 1 in low must be 1.
2363          */
2364         return ((control & high) | low) == control;
2365 }
2366 
2367 static inline u64 vmx_control_msr(u32 low, u32 high)
2368 {
2369         return low | ((u64)high << 32);
2370 }
2371 
2372 /* Returns 0 on success, non-0 otherwise. */
2373 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2374 {
2375         switch (msr_index) {
2376         case MSR_IA32_VMX_BASIC:
2377                 /*
2378                  * This MSR reports some information about VMX support. We
2379                  * should return information about the VMX we emulate for the
2380                  * guest, and the VMCS structure we give it - not about the
2381                  * VMX support of the underlying hardware.
2382                  */
2383                 *pdata = VMCS12_REVISION |
2384                            ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2385                            (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2386                 break;
2387         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2388         case MSR_IA32_VMX_PINBASED_CTLS:
2389                 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
2390                                         nested_vmx_pinbased_ctls_high);
2391                 break;
2392         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2393         case MSR_IA32_VMX_PROCBASED_CTLS:
2394                 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
2395                                         nested_vmx_procbased_ctls_high);
2396                 break;
2397         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2398         case MSR_IA32_VMX_EXIT_CTLS:
2399                 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
2400                                         nested_vmx_exit_ctls_high);
2401                 break;
2402         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2403         case MSR_IA32_VMX_ENTRY_CTLS:
2404                 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
2405                                         nested_vmx_entry_ctls_high);
2406                 break;
2407         case MSR_IA32_VMX_MISC:
2408                 *pdata = vmx_control_msr(nested_vmx_misc_low,
2409                                          nested_vmx_misc_high);
2410                 break;
2411         /*
2412          * These MSRs specify bits which the guest must keep fixed (on or off)
2413          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2414          * We picked the standard core2 setting.
2415          */
2416 #define VMXON_CR0_ALWAYSON      (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2417 #define VMXON_CR4_ALWAYSON      X86_CR4_VMXE
2418         case MSR_IA32_VMX_CR0_FIXED0:
2419                 *pdata = VMXON_CR0_ALWAYSON;
2420                 break;
2421         case MSR_IA32_VMX_CR0_FIXED1:
2422                 *pdata = -1ULL;
2423                 break;
2424         case MSR_IA32_VMX_CR4_FIXED0:
2425                 *pdata = VMXON_CR4_ALWAYSON;
2426                 break;
2427         case MSR_IA32_VMX_CR4_FIXED1:
2428                 *pdata = -1ULL;
2429                 break;
2430         case MSR_IA32_VMX_VMCS_ENUM:
2431                 *pdata = 0x1f;
2432                 break;
2433         case MSR_IA32_VMX_PROCBASED_CTLS2:
2434                 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
2435                                         nested_vmx_secondary_ctls_high);
2436                 break;
2437         case MSR_IA32_VMX_EPT_VPID_CAP:
2438                 /* Currently, no nested vpid support */
2439                 *pdata = nested_vmx_ept_caps;
2440                 break;
2441         default:
2442                 return 1;
2443         }
2444 
2445         return 0;
2446 }
2447 
2448 /*
2449  * Reads an msr value (of 'msr_index') into 'pdata'.
2450  * Returns 0 on success, non-0 otherwise.
2451  * Assumes vcpu_load() was already called.
2452  */
2453 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2454 {
2455         u64 data;
2456         struct shared_msr_entry *msr;
2457 
2458         if (!pdata) {
2459                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2460                 return -EINVAL;
2461         }
2462 
2463         switch (msr_index) {
2464 #ifdef CONFIG_X86_64
2465         case MSR_FS_BASE:
2466                 data = vmcs_readl(GUEST_FS_BASE);
2467                 break;
2468         case MSR_GS_BASE:
2469                 data = vmcs_readl(GUEST_GS_BASE);
2470                 break;
2471         case MSR_KERNEL_GS_BASE:
2472                 vmx_load_host_state(to_vmx(vcpu));
2473                 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2474                 break;
2475 #endif
2476         case MSR_EFER:
2477                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2478         case MSR_IA32_TSC:
2479                 data = guest_read_tsc();
2480                 break;
2481         case MSR_IA32_SYSENTER_CS:
2482                 data = vmcs_read32(GUEST_SYSENTER_CS);
2483                 break;
2484         case MSR_IA32_SYSENTER_EIP:
2485                 data = vmcs_readl(GUEST_SYSENTER_EIP);
2486                 break;
2487         case MSR_IA32_SYSENTER_ESP:
2488                 data = vmcs_readl(GUEST_SYSENTER_ESP);
2489                 break;
2490         case MSR_IA32_FEATURE_CONTROL:
2491                 if (!nested_vmx_allowed(vcpu))
2492                         return 1;
2493                 data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2494                 break;
2495         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2496                 if (!nested_vmx_allowed(vcpu))
2497                         return 1;
2498                 return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2499         case MSR_TSC_AUX:
2500                 if (!to_vmx(vcpu)->rdtscp_enabled)
2501                         return 1;
2502                 /* Otherwise falls through */
2503         default:
2504                 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2505                 if (msr) {
2506                         data = msr->data;
2507                         break;
2508                 }
2509                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2510         }
2511 
2512         *pdata = data;
2513         return 0;
2514 }
2515 
2516 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2517 
2518 /*
2519  * Writes msr value into the appropriate "register".
2520  * Returns 0 on success, non-0 otherwise.
2521  * Assumes vcpu_load() was already called.
2522  */
2523 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2524 {
2525         struct vcpu_vmx *vmx = to_vmx(vcpu);
2526         struct shared_msr_entry *msr;
2527         int ret = 0;
2528         u32 msr_index = msr_info->index;
2529         u64 data = msr_info->data;
2530 
2531         switch (msr_index) {
2532         case MSR_EFER:
2533                 ret = kvm_set_msr_common(vcpu, msr_info);
2534                 break;
2535 #ifdef CONFIG_X86_64
2536         case MSR_FS_BASE:
2537                 vmx_segment_cache_clear(vmx);
2538                 vmcs_writel(GUEST_FS_BASE, data);
2539                 break;
2540         case MSR_GS_BASE:
2541                 vmx_segment_cache_clear(vmx);
2542                 vmcs_writel(GUEST_GS_BASE, data);
2543                 break;
2544         case MSR_KERNEL_GS_BASE:
2545                 vmx_load_host_state(vmx);
2546                 vmx->msr_guest_kernel_gs_base = data;
2547                 break;
2548 #endif
2549         case MSR_IA32_SYSENTER_CS:
2550                 vmcs_write32(GUEST_SYSENTER_CS, data);
2551                 break;
2552         case MSR_IA32_SYSENTER_EIP:
2553                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2554                 break;
2555         case MSR_IA32_SYSENTER_ESP:
2556                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2557                 break;
2558         case MSR_IA32_TSC:
2559                 kvm_write_tsc(vcpu, msr_info);
2560                 break;
2561         case MSR_IA32_CR_PAT:
2562                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2563                         vmcs_write64(GUEST_IA32_PAT, data);
2564                         vcpu->arch.pat = data;
2565                         break;
2566                 }
2567                 ret = kvm_set_msr_common(vcpu, msr_info);
2568                 break;
2569         case MSR_IA32_TSC_ADJUST:
2570                 ret = kvm_set_msr_common(vcpu, msr_info);
2571                 break;
2572         case MSR_IA32_FEATURE_CONTROL:
2573                 if (!nested_vmx_allowed(vcpu) ||
2574                     (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2575                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2576                         return 1;
2577                 vmx->nested.msr_ia32_feature_control = data;
2578                 if (msr_info->host_initiated && data == 0)
2579                         vmx_leave_nested(vcpu);
2580                 break;
2581         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2582                 return 1; /* they are read-only */
2583         case MSR_TSC_AUX:
2584                 if (!vmx->rdtscp_enabled)
2585                         return 1;
2586                 /* Check reserved bit, higher 32 bits should be zero */
2587                 if ((data >> 32) != 0)
2588                         return 1;
2589                 /* Otherwise falls through */
2590         default:
2591                 msr = find_msr_entry(vmx, msr_index);
2592                 if (msr) {
2593                         u64 old_msr_data = msr->data;
2594                         msr->data = data;
2595                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2596                                 preempt_disable();
2597                                 ret = kvm_set_shared_msr(msr->index, msr->data,
2598                                                          msr->mask);
2599                                 preempt_enable();
2600                                 if (ret)
2601                                         msr->data = old_msr_data;
2602                         }
2603                         break;
2604                 }
2605                 ret = kvm_set_msr_common(vcpu, msr_info);
2606         }
2607 
2608         return ret;
2609 }
2610 
2611 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2612 {
2613         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2614         switch (reg) {
2615         case VCPU_REGS_RSP:
2616                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2617                 break;
2618         case VCPU_REGS_RIP:
2619                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2620                 break;
2621         case VCPU_EXREG_PDPTR:
2622                 if (enable_ept)
2623                         ept_save_pdptrs(vcpu);
2624                 break;
2625         default:
2626                 break;
2627         }
2628 }
2629 
2630 static __init int cpu_has_kvm_support(void)
2631 {
2632         return cpu_has_vmx();
2633 }
2634 
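/*
 * Check IA32_FEATURE_CONTROL: if the BIOS locked the MSR without enabling
 * VMXON for the mode we were launched in (with or without TXT/tboot),
 * VMX is effectively disabled by firmware.
 */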
2635 static __init int vmx_disabled_by_bios(void)
2636 {
2637         u64 msr;
2638 
2639         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2640         if (msr & FEATURE_CONTROL_LOCKED) {
2641                 /* launched w/ TXT and VMX disabled */
2642                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2643                         && tboot_enabled())
2644                         return 1;
2645                 /* launched w/o TXT and VMX only enabled w/ TXT */
2646                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2647                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2648                         && !tboot_enabled()) {
2649                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2650                                 "activate TXT before enabling KVM\n");
2651                         return 1;
2652                 }
2653                 /* launched w/o TXT and VMX disabled */
2654                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2655                         && !tboot_enabled())
2656                         return 1;
2657         }
2658 
2659         return 0;
2660 }
2661 
2662 static void kvm_cpu_vmxon(u64 addr)
2663 {
2664         asm volatile (ASM_VMX_VMXON_RAX
2665                         : : "a"(&addr), "m"(addr)
2666                         : "memory", "cc");
2667 }
2668 
2669 static int hardware_enable(void *garbage)
2670 {
2671         int cpu = raw_smp_processor_id();
2672         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2673         u64 old, test_bits;
2674 
2675         if (read_cr4() & X86_CR4_VMXE)
2676                 return -EBUSY;
2677 
2678         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2679 
2680         /*
2681          * Now we can enable the vmclear operation in kdump
2682          * since the loaded_vmcss_on_cpu list on this cpu
2683          * has been initialized.
2684          *
2685          * Though the cpu is not in VMX operation now, it is
2686          * safe to enable the vmclear operation here because
2687          * the loaded_vmcss_on_cpu list is still empty.
2688          */
2689         crash_enable_local_vmclear(cpu);
2690 
2691         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2692 
2693         test_bits = FEATURE_CONTROL_LOCKED;
2694         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2695         if (tboot_enabled())
2696                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2697 
2698         if ((old & test_bits) != test_bits) {
2699                 /* enable and lock */
2700                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2701         }
2702         write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
2703 
2704         if (vmm_exclusive) {
2705                 kvm_cpu_vmxon(phys_addr);
2706                 ept_sync_global();
2707         }
2708 
2709         native_store_gdt(&__get_cpu_var(host_gdt));
2710 
2711         return 0;
2712 }
2713 
2714 static void vmclear_local_loaded_vmcss(void)
2715 {
2716         int cpu = raw_smp_processor_id();
2717         struct loaded_vmcs *v, *n;
2718 
2719         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2720                                  loaded_vmcss_on_cpu_link)
2721                 __loaded_vmcs_clear(v);
2722 }
2723 
2724 
2725 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2726  * tricks.
2727  */
2728 static void kvm_cpu_vmxoff(void)
2729 {
2730         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2731 }
2732 
2733 static void hardware_disable(void *garbage)
2734 {
2735         if (vmm_exclusive) {
2736                 vmclear_local_loaded_vmcss();
2737                 kvm_cpu_vmxoff();
2738         }
2739         write_cr4(read_cr4() & ~X86_CR4_VMXE);
2740 }
2741 
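/*
 * Combine the required (ctl_min) and optional (ctl_opt) control bits with
 * what the capability MSR allows: bits that are 0 in the MSR's high word
 * are cleared, bits that are 1 in its low word are forced on, and -EIO is
 * returned if any required bit cannot be satisfied.
 */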
2742 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2743                                       u32 msr, u32 *result)
2744 {
2745         u32 vmx_msr_low, vmx_msr_high;
2746         u32 ctl = ctl_min | ctl_opt;
2747 
2748         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2749 
2750         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2751         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2752 
2753         /* Ensure minimum (required) set of control bits are supported. */
2754         if (ctl_min & ~ctl)
2755                 return -EIO;
2756 
2757         *result = ctl;
2758         return 0;
2759 }
2760 
2761 static __init bool allow_1_setting(u32 msr, u32 ctl)
2762 {
2763         u32 vmx_msr_low, vmx_msr_high;
2764 
2765         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2766         return vmx_msr_high & ctl;
2767 }
2768 
2769 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2770 {
2771         u32 vmx_msr_low, vmx_msr_high;
2772         u32 min, opt, min2, opt2;
2773         u32 _pin_based_exec_control = 0;
2774         u32 _cpu_based_exec_control = 0;
2775         u32 _cpu_based_2nd_exec_control = 0;
2776         u32 _vmexit_control = 0;
2777         u32 _vmentry_control = 0;
2778 
2779         min = CPU_BASED_HLT_EXITING |
2780 #ifdef CONFIG_X86_64
2781               CPU_BASED_CR8_LOAD_EXITING |
2782               CPU_BASED_CR8_STORE_EXITING |
2783 #endif
2784               CPU_BASED_CR3_LOAD_EXITING |
2785               CPU_BASED_CR3_STORE_EXITING |
2786               CPU_BASED_USE_IO_BITMAPS |
2787               CPU_BASED_MOV_DR_EXITING |
2788               CPU_BASED_USE_TSC_OFFSETING |
2789               CPU_BASED_MWAIT_EXITING |
2790               CPU_BASED_MONITOR_EXITING |
2791               CPU_BASED_INVLPG_EXITING |
2792               CPU_BASED_RDPMC_EXITING;
2793 
2794         opt = CPU_BASED_TPR_SHADOW |
2795               CPU_BASED_USE_MSR_BITMAPS |
2796               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2797         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2798                                 &_cpu_based_exec_control) < 0)
2799                 return -EIO;
2800 #ifdef CONFIG_X86_64
2801         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2802                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2803                                            ~CPU_BASED_CR8_STORE_EXITING;
2804 #endif
2805         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2806                 min2 = 0;
2807                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2808                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2809                         SECONDARY_EXEC_WBINVD_EXITING |
2810                         SECONDARY_EXEC_ENABLE_VPID |
2811                         SECONDARY_EXEC_ENABLE_EPT |
2812                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2813                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2814                         SECONDARY_EXEC_RDTSCP |
2815                         SECONDARY_EXEC_ENABLE_INVPCID |
2816                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
2817                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2818                         SECONDARY_EXEC_SHADOW_VMCS;
2819                 if (adjust_vmx_controls(min2, opt2,
2820                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2821                                         &_cpu_based_2nd_exec_control) < 0)
2822                         return -EIO;
2823         }
2824 #ifndef CONFIG_X86_64
2825         if (!(_cpu_based_2nd_exec_control &
2826                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2827                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2828 #endif
2829 
2830         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2831                 _cpu_based_2nd_exec_control &= ~(
2832                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2833                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2834                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2835 
2836         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2837                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2838                    enabled */
2839                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2840                                              CPU_BASED_CR3_STORE_EXITING |
2841                                              CPU_BASED_INVLPG_EXITING);
2842                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
2843                       vmx_capability.ept, vmx_capability.vpid);
2844         }
2845 
2846         min = 0;
2847 #ifdef CONFIG_X86_64
2848         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2849 #endif
2850         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
2851                 VM_EXIT_ACK_INTR_ON_EXIT;
2852         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2853                                 &_vmexit_control) < 0)
2854                 return -EIO;
2855 
2856         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2857         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
2858         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2859                                 &_pin_based_exec_control) < 0)
2860                 return -EIO;
2861 
2862         if (!(_cpu_based_2nd_exec_control &
2863                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
2864                 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
2865                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2866 
2867         min = 0;
2868         opt = VM_ENTRY_LOAD_IA32_PAT;
2869         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2870                                 &_vmentry_control) < 0)
2871                 return -EIO;
2872 
2873         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2874 
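             /*
              * In the checks below, vmx_msr_high holds bits 63:32 of
              * IA32_VMX_BASIC: bits 44:32 (high & 0x1fff) give the VMCS region
              * size, bit 48 (bit 16 of the high half) restricts VMXON/VMCS
              * addresses to 32 bits, and bits 53:50 ((high >> 18) & 0xf) give
              * the required VMCS memory type (6 == write-back).
              */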
2875         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2876         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2877                 return -EIO;
2878 
2879 #ifdef CONFIG_X86_64
2880         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2881         if (vmx_msr_high & (1u<<16))
2882                 return -EIO;
2883 #endif
2884 
2885         /* Require Write-Back (WB) memory type for VMCS accesses. */
2886         if (((vmx_msr_high >> 18) & 15) != 6)
2887                 return -EIO;
2888 
2889         vmcs_conf->size = vmx_msr_high & 0x1fff;
2890         vmcs_conf->order = get_order(vmcs_config.size);
2891         vmcs_conf->revision_id = vmx_msr_low;
2892 
2893         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2894         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2895         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2896         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2897         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2898 
2899         cpu_has_load_ia32_efer =
2900                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2901                                 VM_ENTRY_LOAD_IA32_EFER)
2902                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2903                                    VM_EXIT_LOAD_IA32_EFER);
2904 
2905         cpu_has_load_perf_global_ctrl =
2906                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2907                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
2908                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2909                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2910 
2911         /*
2912          * Some CPUs support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL,
2913          * but due to the errata below it can't be used. The workaround is
2914          * to use the MSR-load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2915          *
2916          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
2917          *
2918          * AAK155             (model 26)
2919          * AAP115             (model 30)
2920          * AAT100             (model 37)
2921          * BC86,AAY89,BD102   (model 44)
2922          * BA97               (model 46)
2923          *
2924          */
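             /*
              * When cpu_has_load_perf_global_ctrl is cleared below, callers are
              * expected to switch IA32_PERF_GLOBAL_CTRL through the VM-entry/
              * VM-exit MSR-load lists instead of the dedicated controls; a
              * minimal sketch, assuming the add_atomic_switch_msr() helper
              * defined elsewhere in this file:
              *
              *      add_atomic_switch_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL,
              *                            guest_val, host_val);
              */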
2925         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
2926                 switch (boot_cpu_data.x86_model) {
2927                 case 26:
2928                 case 30:
2929                 case 37:
2930                 case 44:
2931                 case 46:
2932                         cpu_has_load_perf_global_ctrl = false;
2933                         printk_once(KERN_WARNING "kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2934                                         "does not work properly. Using workaround\n");
2935                         break;
2936                 default:
2937                         break;
2938                 }
2939         }
2940 
2941         return 0;
2942 }
2943 
2944 static struct vmcs *alloc_vmcs_cpu(int cpu)
2945 {
2946         int node = cpu_to_node(cpu);
2947         struct page *pages;
2948         struct vmcs *vmcs;
2949 
2950         pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
2951         if (!pages)
2952                 return NULL;
2953         vmcs = page_address(pages);
2954         memset(vmcs, 0, vmcs_config.size);
2955         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
2956         return vmcs;
2957 }
2958 
2959 static struct vmcs *alloc_vmcs(void)
2960 {
2961         return alloc_vmcs_cpu(raw_smp_processor_id());
2962 }
2963 
2964 static void free_vmcs(struct vmcs *vmcs)
2965 {
2966         free_pages((unsigned long)vmcs, vmcs_config.order);
2967 }
2968 
2969 /*
2970  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2971  */
2972 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2973 {
2974         if (!loaded_vmcs->vmcs)
2975                 return;
2976         loaded_vmcs_clear(loaded_vmcs);
2977         free_vmcs(loaded_vmcs->vmcs);
2978         loaded_vmcs->vmcs = NULL;
2979 }
2980 
2981 static void free_kvm_area(void)
2982 {
2983         int cpu;
2984 
2985         for_each_possible_cpu(cpu) {
2986                 free_vmcs(per_cpu(vmxarea, cpu));
2987                 per_cpu(vmxarea, cpu) = NULL;
2988         }
2989 }
2990 
2991 static __init int alloc_kvm_area(void)
2992 {
2993         int cpu;
2994 
2995         for_each_possible_cpu(cpu) {
2996                 struct vmcs *vmcs;
2997 
2998                 vmcs = alloc_vmcs_cpu(cpu);
2999                 if (!vmcs) {
3000                         free_kvm_area();
3001                         return -ENOMEM;
3002                 }
3003 
3004                 per_cpu(vmxarea, cpu) = vmcs;
3005         }
3006         return 0;
3007 }
3008 
3009 static __init int hardware_setup(void)
3010 {
3011         if (setup_vmcs_config(&vmcs_config) < 0)
3012                 return -EIO;
3013 
3014         if (boot_cpu_has(X86_FEATURE_NX))
3015                 kvm_enable_efer_bits(EFER_NX);
3016 
3017         if (!cpu_has_vmx_vpid())
3018                 enable_vpid = 0;
3019         if (!cpu_has_vmx_shadow_vmcs())
3020                 enable_shadow_vmcs = 0;
3021 
3022         if (!cpu_has_vmx_ept() ||
3023             !cpu_has_vmx_ept_4levels()) {
3024                 enable_ept = 0;
3025                 enable_unrestricted_guest = 0;
3026                 enable_ept_ad_bits = 0;
3027         }
3028 
3029         if (!cpu_has_vmx_ept_ad_bits())
3030                 enable_ept_ad_bits = 0;
3031 
3032         if (!cpu_has_vmx_unrestricted_guest())
3033                 enable_unrestricted_guest = 0;
3034 
3035         if (!cpu_has_vmx_flexpriority())
3036                 flexpriority_enabled = 0;
3037 
3038         if (!cpu_has_vmx_tpr_shadow())
3039                 kvm_x86_ops->update_cr8_intercept = NULL;
3040 
3041         if (enable_ept && !cpu_has_vmx_ept_2m_page())
3042                 kvm_disable_largepages();
3043 
3044         if (!cpu_has_vmx_ple())
3045                 ple_gap = 0;
3046 
3047         if (!cpu_has_vmx_apicv())
3048                 enable_apicv = 0;
3049 
3050         if (enable_apicv)
3051                 kvm_x86_ops->update_cr8_intercept = NULL;
3052         else {
3053                 kvm_x86_ops->hwapic_irr_update = NULL;
3054                 kvm_x86_ops->deliver_posted_interrupt = NULL;
3055                 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
3056         }
3057 
3058         if (nested)
3059                 nested_vmx_setup_ctls_msrs();
3060 
3061         return alloc_kvm_area();
3062 }
3063 
3064 static __exit void hardware_unsetup(void)
3065 {
3066         free_kvm_area();
3067 }
3068 
3069 static bool emulation_required(struct kvm_vcpu *vcpu)
3070 {
3071         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3072 }
3073 
3074 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3075                 struct kvm_segment *save)
3076 {
3077         if (!emulate_invalid_guest_state) {
3078                 /*
3079                  * CS and SS RPL should be equal during guest entry according
3080                  * to VMX spec, but in reality it is not always so. Since vcpu
3081                  * is in the middle of the transition from real mode to
3082                  * protected mode it is safe to assume that RPL 0 is a good
3083                  * default value.
3084                  */
3085                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3086                         save->selector &= ~SELECTOR_RPL_MASK;
3087                 save->dpl = save->selector & SELECTOR_RPL_MASK;
3088                 save->s = 1;
3089         }
3090         vmx_set_segment(vcpu, save, seg);
3091 }
3092 
3093 static void enter_pmode(struct kvm_vcpu *vcpu)
3094 {
3095         unsigned long flags;
3096         struct vcpu_vmx *vmx = to_vmx(vcpu);
3097 
3098         /*
3099          * Update the real-mode segment cache. It may not be up to date if a
3100          * segment register was written while the vcpu was in guest mode.
3101          */
3102         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3103         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3104         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3105         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3106         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3107         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3108 
3109         vmx->rmode.vm86_active = 0;
3110 
3111         vmx_segment_cache_clear(vmx);
3112 
3113         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3114 
3115         flags = vmcs_readl(GUEST_RFLAGS);
3116         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3117         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3118         vmcs_writel(GUEST_RFLAGS, flags);
3119 
3120         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3121                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3122 
3123         update_exception_bitmap(vcpu);
3124 
3125         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3126         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3127         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3128         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3129         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3130         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3131 
3132         /* CPL is always 0 when CPU enters protected mode */
3133         __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3134         vmx->cpl = 0;
3135 }
3136 
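     /*
      * In real mode a segment's linear base is simply selector << 4, so for a
      * paragraph-aligned base the reverse mapping below is exact: for example,
      * a cached base of 0xb8000 becomes selector 0xb800 with base 0xb8000 and
      * a 64 KiB limit.
      */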
3137 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3138 {
3139         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3140         struct kvm_segment var = *save;
3141 
3142         var.dpl = 0x3;
3143         if (seg == VCPU_SREG_CS)
3144                 var.type = 0x3;
3145 
3146         if (!emulate_invalid_guest_state) {
3147                 var.selector = var.base >> 4;
3148                 var.base = var.base & 0xffff0;
3149                 var.limit = 0xffff;
3150                 var.g = 0;
3151                 var.db = 0;
3152                 var.present = 1;
3153                 var.s = 1;
3154                 var.l = 0;
3155                 var.unusable = 0;
3156                 var.type = 0x3;
3157                 var.avl = 0;
3158                 if (save->base & 0xf)
3159                         printk_once(KERN_WARNING "kvm: segment base is not "
3160                                         "paragraph aligned when entering "
3161                                         "protected mode (seg=%d)", seg);
3162         }
3163 
3164         vmcs_write16(sf->selector, var.selector);
3165         vmcs_write32(sf->base, var.base);
3166         vmcs_write32(sf->limit, var.limit);
3167         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3168 }
3169 
3170 static void enter_rmode(struct kvm_vcpu *vcpu)
3171 {
3172         unsigned long flags;
3173         struct vcpu_vmx *vmx = to_vmx(vcpu);
3174 
3175         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3176         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3177         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3178         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3179         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3180         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3181         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3182 
3183         vmx->rmode.vm86_active = 1;
3184 
3185         /*
3186          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3187          * vcpu. Warn the user that an update is overdue.
3188          */
3189         if (!vcpu->kvm->arch.tss_addr)
3190                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
3191                              "called before entering vcpu\n");
3192 
3193         vmx_segment_cache_clear(vmx);
3194 
3195         vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
3196         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3197         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3198 
3199         flags = vmcs_readl(GUEST_RFLAGS);
3200         vmx->rmode.save_rflags = flags;
3201 
3202         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3203 
3204         vmcs_writel(GUEST_RFLAGS, flags);
3205         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3206         update_exception_bitmap(vcpu);
3207 
3208         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3209         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3210         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3211         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3212         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3213         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3214 
3215         kvm_mmu_reset_context(vcpu);
3216 }
3217 
3218 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3219 {
3220         struct vcpu_vmx *vmx = to_vmx(vcpu);
3221         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3222 
3223         if (!msr)
3224                 return;
3225 
3226         /*
3227          * Force kernel_gs_base reloading before EFER changes, as control
3228          * of this msr depends on is_long_mode().
3229          */
3230         vmx_load_host_state(to_vmx(vcpu));
3231         vcpu->arch.efer = efer;
3232         if (efer & EFER_LMA) {
3233                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3234                 msr->data = efer;
3235         } else {
3236                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3237 
3238                 msr->data = efer & ~EFER_LME;
3239         }
3240         setup_msrs(vmx);
3241 }
3242 
3243 #ifdef CONFIG_X86_64
3244 
3245 static void enter_lmode(struct kvm_vcpu *vcpu)
3246 {
3247         u32 guest_tr_ar;
3248 
3249         vmx_segment_cache_clear(to_vmx(vcpu));
3250 
3251         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3252         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
3253                 pr_debug_ratelimited("%s: tss fixup for long mode.\n",
3254                                      __func__);
3255                 vmcs_write32(GUEST_TR_AR_BYTES,
3256                              (guest_tr_ar & ~AR_TYPE_MASK)
3257                              | AR_TYPE_BUSY_64_TSS);
3258         }
3259         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3260 }
3261 
3262 static void exit_lmode(struct kvm_vcpu *vcpu)
3263 {
3264         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3265         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3266 }
3267 
3268 #endif
3269 
3270 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3271 {
3272         vpid_sync_context(to_vmx(vcpu));
3273         if (enable_ept) {
3274                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3275                         return;
3276                 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
3277         }
3278 }
3279 
3280 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3281 {
3282         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3283 
3284         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3285         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3286 }
3287 
3288 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3289 {
3290         if (enable_ept && is_paging(vcpu))
3291                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3292         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3293 }
3294 
3295 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3296 {
3297         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3298 
3299         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3300         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3301 }
3302 
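     /*
      * With EPT enabled, a 32-bit PAE guest's four PDPTEs are shadowed in the
      * dedicated GUEST_PDPTR0..3 VMCS fields: ept_load_pdptrs() writes them
      * only when they are marked dirty, and ept_save_pdptrs() reads them back
      * and marks them available/dirty again.
      */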
3303 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3304 {
3305         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3306 
3307         if (!test_bit(VCPU_EXREG_PDPTR,
3308                       (unsigned long *)&vcpu->arch.regs_dirty))
3309                 return;
3310 
3311         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3312                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3313                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3314                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3315                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3316         }
3317 }
3318 
3319 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3320 {
3321         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3322 
3323         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3324                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3325                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3326                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3327                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3328         }
3329 
3330         __set_bit(VCPU_EXREG_PDPTR,
3331                   (unsigned long *)&vcpu->arch.regs_avail);
3332         __set_bit(VCPU_EXREG_PDPTR,
3333                   (unsigned long *)&vcpu->arch.regs_dirty);
3334 }
3335 
3336 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
3337 
3338 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3339                                         unsigned long cr0,
3340                                         struct kvm_vcpu *vcpu)
3341 {
3342         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3343                 vmx_decache_cr3(vcpu);
3344         if (!(cr0 & X86_CR0_PG)) {
3345                 /* From paging/starting to nonpaging */
3346                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3347                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
3348                              (CPU_BASED_CR3_LOAD_EXITING |
3349                               CPU_BASED_CR3_STORE_EXITING));
3350                 vcpu->arch.cr0 = cr0;
3351                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3352         } else if (!is_paging(vcpu)) {
3353                 /* From nonpaging to paging */
3354                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3355                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
3356                              ~(CPU_BASED_CR3_LOAD_EXITING |
3357                                CPU_BASED_CR3_STORE_EXITING));
3358                 vcpu->arch.cr0 = cr0;
3359                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3360         }
3361 
3362         if (!(cr0 & X86_CR0_WP))
3363                 *hw_cr0 &= ~X86_CR0_WP;
3364 }
3365 
3366 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3367 {
3368         struct vcpu_vmx *vmx = to_vmx(vcpu);
3369         unsigned long hw_cr0;
3370 
3371         hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3372         if (enable_unrestricted_guest)
3373                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3374         else {
3375                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3376 
3377                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3378                         enter_pmode(vcpu);
3379 
3380                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3381                         enter_rmode(vcpu);
3382         }
3383 
3384 #ifdef CONFIG_X86_64
3385         if (vcpu->arch.efer & EFER_LME) {
3386                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3387                         enter_lmode(vcpu);
3388                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3389                         exit_lmode(vcpu);
3390         }
3391 #endif
3392 
3393         if (enable_ept)
3394                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3395 
3396         if (!vcpu->fpu_active)
3397                 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
3398 
3399         vmcs_writel(CR0_READ_SHADOW, cr0);
3400         vmcs_writel(GUEST_CR0, hw_cr0);
3401         vcpu->arch.cr0 = cr0;
3402 
3403         /* depends on vcpu->arch.cr0 to be set to a new value */
3404         vmx->emulation_required = emulation_required(vcpu);
3405 }
3406 
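     /*
      * EPTP layout assumed below: bits 2:0 are the memory type
      * (VMX_EPT_DEFAULT_MT == 6, write-back), bits 5:3 are the page-walk
      * length minus one (VMX_EPT_DEFAULT_GAW == 3 for a 4-level walk), bit 6
      * enables accessed/dirty flags, and bits 51:12 hold the physical address
      * of the EPT PML4 table; with A/D bits enabled the low byte is 0x5e.
      */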
3407 static u64 construct_eptp(unsigned long root_hpa)
3408 {
3409         u64 eptp;
3410 
3411         /* TODO: write the value read from the MSR */
3412         eptp = VMX_EPT_DEFAULT_MT |
3413                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3414         if (enable_ept_ad_bits)
3415                 eptp |= VMX_EPT_AD_ENABLE_BIT;
3416         eptp |= (root_hpa & PAGE_MASK);
3417 
3418         return eptp;
3419 }
3420 
3421 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3422 {
3423         unsigned long guest_cr3;
3424         u64 eptp;
3425 
3426         guest_cr3 = cr3;
3427         if (enable_ept) {
3428                 eptp = construct_eptp(cr3);
3429                 vmcs_write64(EPT_POINTER, eptp);
3430                 if (is_paging(vcpu) || is_guest_mode(vcpu))
3431                         guest_cr3 = kvm_read_cr3(vcpu);
3432                 else
3433                         guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
3434                 ept_load_pdptrs(vcpu);
3435         }
3436 
3437         vmx_flush_tlb(vcpu);
3438         vmcs_writel(GUEST_CR3, guest_cr3);
3439 }
3440 
3441 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3442 {
3443         unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
3444                     KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
3445 
3446         if (cr4 & X86_CR4_VMXE) {
3447                 /*
3448                  * To use VMXON (and later other VMX instructions), a guest
3449                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
3450                  * So the check on whether to allow nested VMX is
3451                  * effectively done here.
3452                  */
3453                 if (!nested_vmx_allowed(vcpu))
3454                         return 1;
3455         }
3456         if (to_vmx(vcpu)->nested.vmxon &&
3457             ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
3458                 return 1;
3459 
3460         vcpu->arch.cr4 = cr4;
3461         if (enable_ept) {
3462                 if (!is_paging(vcpu)) {
3463                         hw_cr4 &= ~X86_CR4_PAE;
3464                         hw_cr4 |= X86_CR4_PSE;
3465                         /*
3466                          * SMEP is disabled if the CPU is in non-paging mode in
3467                          * hardware. However, KVM always uses paging mode to
3468                          * emulate guest non-paging mode with TDP.
3469                          * To emulate this behavior, SMEP needs to be manually
3470                          * disabled when the guest switches to non-paging mode.
3471                          */
3472                         hw_cr4 &= ~X86_CR4_SMEP;
3473                 } else if (!(cr4 & X86_CR4_PAE)) {
3474                         hw_cr4 &= ~X86_CR4_PAE;
3475                 }
3476         }
3477 
3478         vmcs_writel(CR4_READ_SHADOW, cr4);
3479         vmcs_writel(GUEST_CR4, hw_cr4);
3480         return 0;
3481 }
3482 
3483 static void vmx_get_segment(struct kvm_vcpu *vcpu,
3484                             struct kvm_segment *var, int seg)
3485 {
3486         struct vcpu_vmx *vmx = to_vmx(vcpu);
3487         u32 ar;
3488 
3489         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3490                 *var = vmx->rmode.segs[seg];
3491                 if (seg == VCPU_SREG_TR
3492                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3493                         return;
3494                 var->base = vmx_read_guest_seg_base(vmx, seg);
3495                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3496                 return;
3497         }
3498         var->base = vmx_read_guest_seg_base(vmx, seg);
3499         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3500         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3501         ar = vmx_read_guest_seg_ar(vmx, seg);
3502         var->unusable = (ar >> 16) & 1;
3503         var->type = ar & 15;
3504         var->s = (ar >> 4) & 1;
3505         var->dpl = (ar >> 5) & 3;
3506         /*
3507          * Some userspaces do not preserve the unusable property. Since a
3508          * usable segment has to be present according to the VMX spec, we can
3509          * use the present property to work around the userspace bug by making
3510          * an unusable segment always non-present. vmx_segment_access_rights()
3511          * already marks a non-present segment as unusable.
3512          */
3513         var->present = !var->unusable;
3514         var->avl = (ar >> 12) & 1;
3515         var->l = (ar >> 13) & 1;
3516         var->db = (ar >> 14) & 1;
3517         var->g = (ar >> 15) & 1;
3518 }
3519 
3520 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3521 {
3522         struct kvm_segment s;
3523 
3524         if (to_vmx(vcpu)->rmode.vm86_active) {
3525                 vmx_get_segment(vcpu, &s, seg);
3526                 return s.base;
3527         }
3528         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3529 }
3530 
3531 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3532 {
3533         struct vcpu_vmx *vmx = to_vmx(vcpu);
3534 
3535         if (!is_protmode(vcpu))
3536                 return 0;
3537 
3538         if (!is_long_mode(vcpu)
3539             && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
3540                 return 3;
3541 
3542         if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3543                 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3544                 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3545         }
3546 
3547         return vmx->cpl;
3548 }
3549 
3550 
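     /*
      * The VMX access-rights encoding built below mirrors the segment
      * descriptor: type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7,
      * AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15, plus the
      * VMX-specific "unusable" flag in bit 16.
      */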
3551 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3552 {
3553         u32 ar;
3554 
3555         if (var->unusable || !var->present)
3556                 ar = 1 << 16;
3557         else {
3558                 ar = var->type & 15;
3559                 ar |= (var->s & 1) << 4;
3560                 ar |= (var->dpl & 3) << 5;
3561                 ar |= (var->present & 1) << 7;
3562                 ar |= (var->avl & 1) << 12;
3563                 ar |= (var->l & 1) << 13;
3564                 ar |= (var->db & 1) << 14;
3565                 ar |= (var->g & 1) << 15;
3566         }
3567 
3568         return ar;
3569 }
3570 
3571 static void vmx_set_segment(struct kvm_vcpu *vcpu,
3572                             struct kvm_segment *var, int seg)
3573 {
3574         struct vcpu_vmx *vmx = to_vmx(vcpu);
3575         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3576 
3577         vmx_segment_cache_clear(vmx);
3578         if (seg == VCPU_SREG_CS)
3579                 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3580 
3581         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3582                 vmx->rmode.segs[seg] = *var;
3583                 if (seg == VCPU_SREG_TR)
3584                         vmcs_write16(sf->selector, var->selector);
3585                 else if (var->s)
3586                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3587                 goto out;
3588         }
3589 
3590         vmcs_writel(sf->base, var->base);
3591         vmcs_write32(sf->limit, var->limit);
3592         vmcs_write16(sf->selector, var->selector);
3593 
3594         /*
3595          *   Fix the "Accessed" bit in the AR field of segment registers for
3596          * older qemu binaries.
3597          *   The IA-32 architecture specifies that at processor reset the
3598          * "Accessed" bit in the AR field of segment registers is 1, but qemu
3599          * sets it to 0 in its userland code. This causes an invalid-guest-state
3600          * vmexit when "unrestricted guest" mode is turned on.
3601          *   A fix for this setup issue in cpu_reset has been pushed to the qemu
3602          * tree. Newer qemu binaries with that fix do not need this
3603          * kvm hack.
3604          */
3605         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3606                 var->type |= 0x1; /* Accessed */
3607 
3608         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3609 
3610 out:
3611         vmx->emulation_required |= emulation_required(vcpu);
3612 }
3613 
3614 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3615 {
3616         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3617 
3618         *db = (ar >> 14) & 1;
3619         *l = (ar >> 13) & 1;
3620 }
3621 
3622 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3623 {
3624         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3625         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3626 }
3627 
3628 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3629 {
3630         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3631         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3632 }
3633 
3634 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3635 {
3636         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3637         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3638 }
3639 
3640 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3641 {
3642         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3643         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3644 }
3645 
3646 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3647 {
3648         struct kvm_segment var;
3649         u32 ar;
3650 
3651         vmx_get_segment(vcpu, &var, seg);
3652         var.dpl = 0x3;
3653         if (seg == VCPU_SREG_CS)
3654                 var.type = 0x3;
3655         ar = vmx_segment_access_rights(&var);
3656 
3657         if (var.base != (var.selector << 4))
3658                 return false;
3659         if (var.limit != 0xffff)
3660                 return false;
3661         if (ar != 0xf3)
3662                 return false;
3663 
3664         return true;
3665 }
3666 
3667 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3668 {
3669         struct kvm_segment cs;
3670         unsigned int cs_rpl;
3671 
3672         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3673         cs_rpl = cs.selector & SELECTOR_RPL_MASK;
3674 
3675         if (cs.unusable)
3676                 return false;
3677         if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
3678                 return false;
3679         if (!cs.s)
3680                 return false;
3681         if (cs.type & AR_TYPE_WRITEABLE_MASK) {
3682                 if (cs.dpl > cs_rpl)
3683                         return false;
3684         } else {
3685                 if (cs.dpl != cs_rpl)
3686                         return false;
3687         }
3688         if (!cs.present)
3689                 return false;
3690 
3691         /* TODO: add a reserved-field check; this will require a new member in the kvm_segment_field structure */
3692         return true;
3693 }
3694 
3695 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3696 {
3697         struct kvm_segment ss;
3698         unsigned int ss_rpl;
3699 
3700         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3701         ss_rpl = ss.selector & SELECTOR_RPL_MASK;
3702 
3703         if (ss.unusable)
3704                 return true;
3705         if (ss.type != 3 && ss.type != 7)
3706                 return false;
3707         if (!ss.s)
3708                 return false;
3709         if (ss.dpl != ss_rpl) /* DPL != RPL */
3710                 return false;
3711         if (!ss.present)
3712                 return false;
3713 
3714         return true;
3715 }
3716 
3717 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3718 {
3719         struct kvm_segment var;
3720         unsigned int rpl;
3721 
3722         vmx_get_segment(vcpu, &var, seg);
3723         rpl = var.selector & SELECTOR_RPL_MASK;
3724 
3725         if (var.unusable)
3726                 return true;
3727         if (!var.s)
3728                 return false;
3729         if (!var.present)
3730                 return false;
3731         if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
3732                 if (var.dpl < rpl) /* DPL < RPL */
3733                         return false;
3734         }
3735 
3736         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3737          * rights flags
3738          */
3739         return true;
3740 }
3741 
3742 static bool tr_valid(struct kvm_vcpu *vcpu)
3743 {
3744         struct kvm_segment tr;
3745 
3746         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3747 
3748         if (tr.unusable)
3749                 return false;
3750         if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
3751                 return false;
3752         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3753                 return false;
3754         if (!tr.present)
3755                 return false;
3756 
3757         return true;
3758 }
3759 
3760 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3761 {
3762         struct kvm_segment ldtr;
3763 
3764         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3765 
3766         if (ldtr.unusable)
3767                 return true;
3768         if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
3769                 return false;
3770         if (ldtr.type != 2)
3771                 return false;
3772         if (!ldtr.present)
3773                 return false;
3774 
3775         return true;
3776 }
3777 
3778 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3779 {
3780         struct kvm_segment cs, ss;
3781 
3782         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3783         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3784 
3785         return ((cs.selector & SELECTOR_RPL_MASK) ==
3786                  (ss.selector & SELECTOR_RPL_MASK));
3787 }
3788 
3789 /*
3790  * Check if the guest state is valid. Returns true if valid, false if
3791  * not.
3792  * We assume that registers are always usable.
3793  */
3794 static bool guest_state_valid(struct kvm_vcpu *vcpu)
3795 {
3796         if (enable_unrestricted_guest)
3797                 return true;
3798 
3799         /* real mode guest state checks */
3800         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3801                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3802                         return false;
3803                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3804                         return false;
3805                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3806                         return false;
3807                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3808                         return false;
3809                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3810                         return false;
3811                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3812                         return false;
3813         } else {
3814         /* protected mode guest state checks */
3815                 if (!cs_ss_rpl_check(vcpu))
3816                         return false;
3817                 if (!code_segment_valid(vcpu))
3818                         return false;
3819                 if (!stack_segment_valid(vcpu))
3820                         return false;
3821                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3822                         return false;
3823                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3824                         return false;
3825                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3826                         return false;
3827                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3828                         return false;
3829                 if (!tr_valid(vcpu))
3830                         return false;
3831                 if (!ldtr_valid(vcpu))
3832                         return false;
3833         }
3834         /* TODO:
3835          * - Add checks on RIP
3836          * - Add checks on RFLAGS
3837          */
3838 
3839         return true;
3840 }
3841 
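     /*
      * The real-mode TSS set up below lives at kvm->arch.tss_addr: a base TSS
      * followed by the interrupt-redirection bitmap and the I/O permission
      * bitmap. The I/O bitmap base offset is written into the TSS, and the
      * final byte of the structure is set to 0xff to terminate the I/O
      * bitmap.
      */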
3842 static int init_rmode_tss(struct kvm *kvm)
3843 {
3844         gfn_t fn;
3845         u16 data = 0;
3846         int r, idx, ret = 0;
3847 
3848         idx = srcu_read_lock(&kvm->srcu);
3849         fn = kvm->arch.tss_addr >> PAGE_SHIFT;
3850         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3851         if (r < 0)
3852                 goto out;
3853         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3854         r = kvm_write_guest_page(kvm, fn++, &data,
3855                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
3856         if (r < 0)
3857                 goto out;
3858         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3859         if (r < 0)
3860                 goto out;
3861         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3862         if (r < 0)
3863                 goto out;
3864         data = ~0;
3865         r = kvm_write_guest_page(kvm, fn, &data,
3866                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3867                                  sizeof(u8));
3868         if (r < 0)
3869                 goto out;
3870 
3871         ret = 1;
3872 out:
3873         srcu_read_unlock(&kvm->srcu, idx);
3874         return ret;
3875 }
3876 
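     /*
      * The identity map built below is a single 32-bit page directory whose
      * 1024 PSE entries each map a 4 MiB large page, so entry i covers guest
      * physical addresses [i << 22, (i + 1) << 22) one-to-one.
      */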
3877 static int init_rmode_identity_map(struct kvm *kvm)
3878 {
3879         int i, idx, r, ret;
3880         pfn_t identity_map_pfn;
3881         u32 tmp;
3882 
3883         if (!enable_ept)
3884                 return 1;
3885         if (unlikely(!kvm->arch.ept_identity_pagetable)) {
3886                 printk(KERN_ERR "EPT: identity-mapping pagetable "
3887                         "hasn't been allocated!\n");
3888                 return 0;
3889         }
3890         if (likely(kvm->arch.ept_identity_pagetable_done))
3891                 return 1;
3892         ret = 0;
3893         identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
3894         idx = srcu_read_lock(&kvm->srcu);
3895         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3896         if (r < 0)
3897                 goto out;
3898         /* Set up identity-mapping pagetable for EPT in real mode */
3899         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3900                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3901                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3902                 r = kvm_write_guest_page(kvm, identity_map_pfn,
3903                                 &tmp, i * sizeof(tmp), sizeof(tmp));
3904                 if (r < 0)
3905                         goto out;
3906         }
3907         kvm->arch.ept_identity_pagetable_done = true;
3908         ret = 1;
3909 out:
3910         srcu_read_unlock(&kvm->srcu, idx);
3911         return ret;
3912 }
3913 
3914 static void seg_setup(int seg)
3915 {
3916         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3917         unsigned int ar;
3918 
3919         vmcs_write16(sf->selector, 0);
3920         vmcs_writel(sf->base, 0);
3921         vmcs_write32(sf->limit, 0xffff);
3922         ar = 0x93;
3923         if (seg == VCPU_SREG_CS)
3924                 ar |= 0x08; /* code segment */
3925 
3926         vmcs_write32(sf->ar_bytes, ar);
3927 }
3928 
3929 static int alloc_apic_access_page(struct kvm *kvm)
3930 {
3931         struct page *page;
3932         struct kvm_userspace_memory_region kvm_userspace_mem;
3933         int r = 0;
3934 
3935         mutex_lock(&kvm->slots_lock);
3936         if (kvm->arch.apic_access_page)
3937                 goto out;
3938         kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
3939         kvm_userspace_mem.flags = 0;
3940         kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3941         kvm_userspace_mem.memory_size = PAGE_SIZE;
3942         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3943         if (r)
3944                 goto out;
3945 
3946         page = gfn_to_page(kvm, 0xfee00);
3947         if (is_error_page(page)) {
3948                 r = -EFAULT;
3949                 goto out;
3950         }
3951 
3952         kvm->arch.apic_access_page = page;
3953 out:
3954         mutex_unlock(&kvm->slots_lock);
3955         return r;
3956 }
3957 
3958 static int alloc_identity_pagetable(struct kvm *kvm)
3959 {
3960         struct page *page;
3961         struct kvm_userspace_memory_region kvm_userspace_mem;
3962         int r = 0;
3963 
3964         mutex_lock(&kvm->slots_lock);
3965         if (kvm->arch.ept_identity_pagetable)
3966                 goto out;
3967         kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
3968         kvm_userspace_mem.flags = 0;
3969         kvm_userspace_mem.guest_phys_addr =
3970                 kvm->arch.ept_identity_map_addr;
3971         kvm_userspace_mem.memory_size = PAGE_SIZE;
3972         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3973         if (r)
3974                 goto out;
3975 
3976         page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
3977         if (is_error_page(page)) {
3978                 r = -EFAULT;
3979                 goto out;
3980         }
3981 
3982         kvm->arch.ept_identity_pagetable = page;
3983 out:
3984         mutex_unlock(&kvm->slots_lock);
3985         return r;
3986 }
3987 
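     /*
      * VPID 0 is reserved for the host; a vcpu that cannot get a VPID from the
      * bitmap keeps vpid == 0, and vmx_secondary_exec_control() then clears
      * SECONDARY_EXEC_ENABLE_VPID for it.
      */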
3988 static void allocate_vpid(struct vcpu_vmx *vmx)
3989 {
3990         int vpid;
3991 
3992         vmx->vpid = 0;
3993         if (!enable_vpid)
3994                 return;
3995         spin_lock(&vmx_vpid_lock);
3996         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3997         if (vpid < VMX_NR_VPIDS) {
3998                 vmx->vpid = vpid;
3999                 __set_bit(vpid, vmx_vpid_bitmap);
4000         }
4001         spin_unlock(&vmx_vpid_lock);
4002 }
4003 
4004 static void free_vpid(struct vcpu_vmx *vmx)
4005 {
4006         if (!enable_vpid)
4007                 return;
4008         spin_lock(&vmx_vpid_lock);
4009         if (vmx->vpid != 0)
4010                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
4011         spin_unlock(&vmx_vpid_lock);
4012 }
4013 
4014 #define MSR_TYPE_R      1
4015 #define MSR_TYPE_W      2
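     /*
      * Layout of the 4 KiB MSR bitmap manipulated below: read-low bits for
      * MSRs 0x00000000-0x00001fff at offset 0x000, read-high bits for MSRs
      * 0xc0000000-0xc0001fff at offset 0x400, write-low at 0x800 and
      * write-high at 0xc00, one bit per MSR; a clear bit means the access
      * does not cause a VM exit.
      */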
4016 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4017                                                 u32 msr, int type)
4018 {
4019         int f = sizeof(unsigned long);
4020 
4021         if (!cpu_has_vmx_msr_bitmap())
4022                 return;
4023 
4024         /*
4025          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4026          * have the write-low and read-high bitmap offsets the wrong way round.
4027          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4028          */
4029         if (msr <= 0x1fff) {
4030                 if (type & MSR_TYPE_R)
4031                         /* read-low */
4032                         __clear_bit(msr, msr_bitmap + 0x000 / f);
4033 
4034                 if (type & MSR_TYPE_W)
4035                         /* write-low */
4036                         __clear_bit(msr, msr_bitmap + 0x800 / f);
4037 
4038         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4039                 msr &= 0x1fff;
4040                 if (type & MSR_TYPE_R)
4041                         /* read-high */
4042                         __clear_bit(msr, msr_bitmap + 0x400 / f);
4043 
4044                 if (type & MSR_TYPE_W)
4045                         /* write-high */
4046                         __clear_bit(msr, msr_bitmap + 0xc00 / f);
4047 
4048         }
4049 }
4050 
4051 static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4052                                                 u32 msr, int type)
4053 {
4054         int f = sizeof(unsigned long);
4055 
4056         if (!cpu_has_vmx_msr_bitmap())
4057                 return;
4058 
4059         /*
4060          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4061          * have the write-low and read-high bitmap offsets the wrong way round.
4062          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4063          */
4064         if (msr <= 0x1fff) {
4065                 if (type & MSR_TYPE_R)
4066                         /* read-low */
4067                         __set_bit(msr, msr_bitmap + 0x000 / f);
4068 
4069                 if (type & MSR_TYPE_W)
4070                         /* write-low */
4071                         __set_bit(msr, msr_bitmap + 0x800 / f);
4072 
4073         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4074                 msr &= 0x1fff;
4075                 if (type & MSR_TYPE_R)
4076                         /* read-high */
4077                         __set_bit(msr, msr_bitmap + 0x400 / f);
4078 
4079                 if (type & MSR_TYPE_W)
4080                         /* write-high */
4081                         __set_bit(msr, msr_bitmap + 0xc00 / f);
4082 
4083         }
4084 }
4085 
4086 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4087 {
4088         if (!longmode_only)
4089                 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
4090                                                 msr, MSR_TYPE_R | MSR_TYPE_W);
4091         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
4092                                                 msr, MSR_TYPE_R | MSR_TYPE_W);
4093 }
4094 
4095 static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
4096 {
4097         __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4098                         msr, MSR_TYPE_R);
4099         __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4100                         msr, MSR_TYPE_R);
4101 }
4102 
4103 static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
4104 {
4105         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4106                         msr, MSR_TYPE_R);
4107         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4108                         msr, MSR_TYPE_R);
4109 }
4110 
4111 static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
4112 {
4113         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4114                         msr, MSR_TYPE_W);
4115         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4116                         msr, MSR_TYPE_W);
4117 }
4118 
4119 static int vmx_vm_has_apicv(struct kvm *kvm)
4120 {
4121         return enable_apicv && irqchip_in_kernel(kvm);
4122 }
4123 
4124 /*
4125  * Send an interrupt to a vcpu via the posted-interrupt mechanism.
4126  * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4127  * notification to the vcpu and hardware will sync PIR to vIRR atomically.
4128  * 2. If the target vcpu isn't running (root mode), kick it to pick up the
4129  * interrupt from PIR on the next vmentry.
4130  */
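     /*
      * The posted-interrupt descriptor used below holds a 256-bit
      * pending-interrupt request bitmap (PIR) plus an outstanding-notification
      * (ON) bit; pi_test_and_set_pir() and pi_test_and_set_on() set these
      * atomically before the notification IPI or kick.
      */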
4131 static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4132 {
4133         struct vcpu_vmx *vmx = to_vmx(vcpu);
4134         int r;
4135 
4136         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4137                 return;
4138 
4139         r = pi_test_and_set_on(&vmx->pi_desc);
4140         kvm_make_request(KVM_REQ_EVENT, vcpu);
4141 #ifdef CONFIG_SMP
4142         if (!r && (vcpu->mode == IN_GUEST_MODE))
4143                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4144                                 POSTED_INTR_VECTOR);
4145         else
4146 #endif
4147                 kvm_vcpu_kick(vcpu);
4148 }
4149 
4150 static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4151 {
4152         struct vcpu_vmx *vmx = to_vmx(vcpu);
4153 
4154         if (!pi_test_and_clear_on(&vmx->pi_desc))
4155                 return;
4156 
4157         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
4158 }
4159 
4160 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
4161 {
4162         return;
4163 }
4164 
4165 /*
4166  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4167  * will not change in the lifetime of the guest.
4168  * Note that host-state that does change is set elsewhere. E.g., host-state
4169  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4170  */
4171 static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4172 {
4173         u32 low32, high32;
4174         unsigned long tmpl;
4175         struct desc_ptr dt;
4176         unsigned long cr4;
4177 
4178         vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
4179         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
4180 
4181         /* Save the most likely value for this task's CR4 in the VMCS. */
4182         cr4 = read_cr4();
4183         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4184         vmx->host_state.vmcs_host_cr4 = cr4;
4185 
4186         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4187 #ifdef CONFIG_X86_64
4188         /*
4189          * Load null selectors, so we can avoid reloading them in
4190          * __vmx_load_host_state(), in case userspace uses the null selectors
4191          * too (the expected case).
4192          */
4193         vmcs_write16(HOST_DS_SELECTOR, 0);
4194         vmcs_write16(HOST_ES_SELECTOR, 0);
4195 #else
4196         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4197         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4198 #endif
4199         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4200         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4201 
4202         native_store_idt(&dt);
4203         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
4204         vmx->host_idt_base = dt.address;
4205 
4206         vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
4207 
4208         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4209         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4210         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4211         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4212 
4213         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4214                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4215                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4216         }
4217 }
4218 
4219 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4220 {
4221         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
4222         if (enable_ept)
4223                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4224         if (is_guest_mode(&vmx->vcpu))
4225                 vmx->vcpu.arch.cr4_guest_owned_bits &=
4226                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
4227         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
4228 }
4229 
4230 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4231 {
4232         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4233 
4234         if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4235                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4236         return pin_based_exec_ctrl;
4237 }
4238 
4239 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4240 {
4241         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4242         if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
4243                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4244 #ifdef CONFIG_X86_64
4245                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4246                                 CPU_BASED_CR8_LOAD_EXITING;
4247 #endif
4248         }
4249         if (!enable_ept)
4250                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
4251                                 CPU_BASED_CR3_LOAD_EXITING  |
4252                                 CPU_BASED_INVLPG_EXITING;
4253         return exec_control;
4254 }
4255 
4256 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4257 {
4258         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4259         if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
4260                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4261         if (vmx->vpid == 0)
4262                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4263         if (!enable_ept) {
4264                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4265                 enable_unrestricted_guest = 0;
4266                 /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
4267                 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4268         }
4269         if (!enable_unrestricted_guest)
4270                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4271         if (!ple_gap)
4272                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4273         if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4274                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4275                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4276         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4277         /*
4278          * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4279          * (handle_vmptrld). We cannot enable shadow_vmcs here because we
4280          * do not yet have a current VMCS12.
4281          */
4282         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4283         return exec_control;
4284 }
4285 
4286 static void ept_set_mmio_spte_mask(void)
4287 {
4288         /*
4289          * EPT Misconfigurations can be generated if the value of bits 2:0
4290          * of an EPT paging-structure entry is 110b (write/execute).
4291          * Also, the magic bits (0x3ull << 62) are set to quickly identify an
4292          * MMIO spte.
4293          */
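             /*
              * Concretely, the mask passed below is (0x3ull << 62) | 0x6,
              * i.e. 0xc000000000000006.
              */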
4294         kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
4295 }
4296 
4297 /*
4298  * Sets up the vmcs for emulated real mode.
4299  */
4300 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4301 {
4302 #ifdef CONFIG_X86_64
4303         unsigned long a;
4304 #endif
4305         int i;
4306 
4307         /* I/O */
4308         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
4309         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
4310 
4311         if (enable_shadow_vmcs) {
4312                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
4313                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
4314         }
4315         if (cpu_has_vmx_msr_bitmap())
4316                 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
4317 
4318         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4319 
4320         /* Control */
4321         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4322 
4323         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4324 
4325         if (cpu_has_secondary_exec_ctrls()) {
4326                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4327                                 vmx_secondary_exec_control(vmx));
4328         }
4329 
4330         if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
4331                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4332                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4333                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4334                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4335 
4336                 vmcs_write16(GUEST_INTR_STATUS, 0);
4337 
4338                 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4339                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4340         }
4341 
4342         if (ple_gap) {
4343                 vmcs_write32(PLE_GAP, ple_gap);
4344                 vmcs_write32(PLE_WINDOW, ple_window);
4345         }
4346 
4347         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4348         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4349         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4350 
4351         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4352         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4353         vmx_set_constant_host_state(vmx);
4354 #ifdef CONFIG_X86_64
4355         rdmsrl(MSR_FS_BASE, a);
4356         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
4357         rdmsrl(MSR_GS_BASE, a);
4358         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
4359 #else
4360         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4361         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4362 #endif
4363 
4364         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4365         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4366         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
4367         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4368         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
4369 
4370         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4371                 u32 msr_low, msr_high;
4372                 u64 host_pat;
4373                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
4374                 host_pat = msr_low | ((u64) msr_high << 32);
4375                 /* Write the default value, following the host PAT */
4376                 vmcs_write64(GUEST_IA32_PAT, host_pat);
4377                 /* Keep arch.pat in sync with GUEST_IA32_PAT */
4378                 vmx->vcpu.arch.pat = host_pat;
4379         }
4380 
4381         for (i = 0; i < NR_VMX_MSR; ++i) {
4382                 u32 index = vmx_msr_index[i];
4383                 u32 data_low, data_high;
4384                 int j = vmx->nmsrs;
4385 
4386                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
4387                         continue;
4388                 if (wrmsr_safe(index, data_low, data_high) < 0)
4389                         continue;
4390                 vmx->guest_msrs[j].index = i;
4391                 vmx->guest_msrs[j].data = 0;
4392                 vmx->guest_msrs[j].mask = -1ull;
4393                 ++vmx->nmsrs;
4394         }
4395 
4396 
4397         vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
4398 
4399         /* 22.2.1, 20.8.1 */
4400         vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
4401 
4402         vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
4403         set_cr4_guest_host_mask(vmx);
4404 
4405         return 0;
4406 }
4407 
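/*
 * vmx_vcpu_reset() loads the architectural power-on state into the VMCS
 * guest-state area: CS selector 0xf000 with base 0xffff0000 and RIP 0xfff0
 * (so the first fetch hits the reset vector at 0xfffffff0), RFLAGS with only
 * its reserved bit 1 set, and CR0 starting with CD, NW and ET.  This is a
 * rough summary; the individual writes below are authoritative.
 */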
4408 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4409 {
4410         struct vcpu_vmx *vmx = to_vmx(vcpu);
4411         struct msr_data apic_base_msr;
4412 
4413         vmx->rmode.vm86_active = 0;
4414 
4415         vmx->soft_vnmi_blocked = 0;
4416 
4417         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4418         kvm_set_cr8(&vmx->vcpu, 0);
4419         apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
4420         if (kvm_vcpu_is_bsp(&vmx->vcpu))
4421                 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4422         apic_base_msr.host_initiated = true;
4423         kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
4424 
4425         vmx_segment_cache_clear(vmx);
4426 
4427         seg_setup(VCPU_SREG_CS);
4428         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4429         vmcs_write32(GUEST_CS_BASE, 0xffff0000);
4430 
4431         seg_setup(VCPU_SREG_DS);
4432         seg_setup(VCPU_SREG_ES);
4433         seg_setup(VCPU_SREG_FS);
4434         seg_setup(VCPU_SREG_GS);
4435         seg_setup(VCPU_SREG_SS);
4436 
4437         vmcs_write16(GUEST_TR_SELECTOR, 0);
4438         vmcs_writel(GUEST_TR_BASE, 0);
4439         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4440         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4441 
4442         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4443         vmcs_writel(GUEST_LDTR_BASE, 0);
4444         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4445         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4446 
4447         vmcs_write32(GUEST_SYSENTER_CS, 0);
4448         vmcs_writel(GUEST_SYSENTER_ESP, 0);
4449         vmcs_writel(GUEST_SYSENTER_EIP, 0);
4450 
4451         vmcs_writel(GUEST_RFLAGS, 0x02);
4452         kvm_rip_write(vcpu, 0xfff0);
4453 
4454         vmcs_writel(GUEST_GDTR_BASE, 0);
4455         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4456 
4457         vmcs_writel(GUEST_IDTR_BASE, 0);
4458         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4459 
4460         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4461         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4462         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4463 
4464         /* Special registers */
4465         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4466 
4467         setup_msrs(vmx);
4468 
4469         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4470 
4471         if (cpu_has_vmx_tpr_shadow()) {
4472                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4473                 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
4474                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4475                                      __pa(vmx->vcpu.arch.apic->regs));
4476                 vmcs_write32(TPR_THRESHOLD, 0);
4477         }
4478 
4479         if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
4480                 vmcs_write64(APIC_ACCESS_ADDR,
4481                              page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
4482 
4483         if (vmx_vm_has_apicv(vcpu->kvm))
4484                 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4485 
4486         if (vmx->vpid != 0)
4487                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4488 
4489         vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4490         vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4491         vmx_set_cr4(&vmx->vcpu, 0);
4492         vmx_set_efer(&vmx->vcpu, 0);
4493         vmx_fpu_activate(&vmx->vcpu);
4494         update_exception_bitmap(&vmx->vcpu);
4495 
4496         vpid_sync_context(vmx);
4497 }
4498 
4499 /*
4500  * In nested virtualization, check if L1 asked to exit on external interrupts.
4501  * For most existing hypervisors, this will always return true.
4502  */
4503 static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4504 {
4505         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4506                 PIN_BASED_EXT_INTR_MASK;
4507 }
4508 
4509 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4510 {
4511         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4512                 PIN_BASED_NMI_EXITING;
4513 }
4514 
4515 static int enable_irq_window(struct kvm_vcpu *vcpu)
4516 {
4517         u32 cpu_based_vm_exec_control;
4518 
4519         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4520                 /*
4521                  * We get here if vmx_interrupt_allowed() said we can't
4522                  * inject to L1 now because L2 must run. The caller will have
4523                  * to make L2 exit right after entry, so we can inject to L1
4524                  * more promptly.
4525                  */
4526                 return -EBUSY;
4527 
4528         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4529         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4530         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4531         return 0;
4532 }
4533 
4534 static int enable_nmi_window(struct kvm_vcpu *vcpu)
4535 {
4536         u32 cpu_based_vm_exec_control;
4537 
4538         if (!cpu_has_virtual_nmis())
4539                 return enable_irq_window(vcpu);
4540 
4541         if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
4542                 return enable_irq_window(vcpu);
4543 
4544         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4545         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4546         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4547         return 0;
4548 }
4549 
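/*
 * Interrupt injection: in protected mode the event is programmed into
 * VM_ENTRY_INTR_INFO_FIELD (vector, type and valid bit) and delivered by the
 * CPU on the next VM entry; soft interrupts additionally need the instruction
 * length.  In emulated real mode (rmode.vm86_active) the interrupt is instead
 * pushed through the real-mode injection helper.
 */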
4550 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4551 {
4552         struct vcpu_vmx *vmx = to_vmx(vcpu);
4553         uint32_t intr;
4554         int irq = vcpu->arch.interrupt.nr;
4555 
4556         trace_kvm_inj_virq(irq);
4557 
4558         ++vcpu->stat.irq_injections;
4559         if (vmx->rmode.vm86_active) {
4560                 int inc_eip = 0;
4561                 if (vcpu->arch.interrupt.soft)
4562                         inc_eip = vcpu->arch.event_exit_inst_len;
4563                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
4564                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4565                 return;
4566         }
4567         intr = irq | INTR_INFO_VALID_MASK;
4568         if (vcpu->arch.interrupt.soft) {
4569                 intr |= INTR_TYPE_SOFT_INTR;
4570                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4571                              vmx->vcpu.arch.event_exit_inst_len);
4572         } else
4573                 intr |= INTR_TYPE_EXT_INTR;
4574         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4575 }
4576 
4577 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4578 {
4579         struct vcpu_vmx *vmx = to_vmx(vcpu);
4580 
4581         if (is_guest_mode(vcpu))
4582                 return;
4583 
4584         if (!cpu_has_virtual_nmis()) {
4585                 /*
4586                  * Tracking the NMI-blocked state in software is built upon
4587                  * finding the next open IRQ window. This, in turn, depends on
4588                  * well-behaving guests: They have to keep IRQs disabled at
4589                  * least as long as the NMI handler runs. Otherwise we may
4590                  * cause NMI nesting, maybe breaking the guest. But as this is
4591                  * highly unlikely, we can live with the residual risk.
4592                  */
4593                 vmx->soft_vnmi_blocked = 1;
4594                 vmx->vnmi_blocked_time = 0;
4595         }
4596 
4597         ++vcpu->stat.nmi_injections;
4598         vmx->nmi_known_unmasked = false;
4599         if (vmx->rmode.vm86_active) {
4600                 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
4601                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4602                 return;
4603         }
4604         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4605                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4606 }
4607 
4608 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4609 {
4610         if (!cpu_has_virtual_nmis())
4611                 return to_vmx(vcpu)->soft_vnmi_blocked;
4612         if (to_vmx(vcpu)->nmi_known_unmasked)
4613                 return false;
4614         return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4615 }
4616 
4617 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4618 {
4619         struct vcpu_vmx *vmx = to_vmx(vcpu);
4620 
4621         if (!cpu_has_virtual_nmis()) {
4622                 if (vmx->soft_vnmi_blocked != masked) {
4623                         vmx->soft_vnmi_blocked = masked;
4624                         vmx->vnmi_blocked_time = 0;
4625                 }
4626         } else {
4627                 vmx->nmi_known_unmasked = !masked;
4628                 if (masked)
4629                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4630                                       GUEST_INTR_STATE_NMI);
4631                 else
4632                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4633                                         GUEST_INTR_STATE_NMI);
4634         }
4635 }
4636 
4637 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4638 {
4639         if (is_guest_mode(vcpu)) {
4640                 if (to_vmx(vcpu)->nested.nested_run_pending)
4641                         return 0;
4642                 if (nested_exit_on_nmi(vcpu)) {
4643                         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4644                                           NMI_VECTOR | INTR_TYPE_NMI_INTR |
4645                                           INTR_INFO_VALID_MASK, 0);
4646                         /*
4647                          * The NMI-triggered VM exit counts as injection:
4648                          * clear this one and block further NMIs.
4649                          */
4650                         vcpu->arch.nmi_pending = 0;
4651                         vmx_set_nmi_mask(vcpu, true);
4652                         return 0;
4653                 }
4654         }
4655 
4656         if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4657                 return 0;
4658 
4659         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4660                   (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4661                    | GUEST_INTR_STATE_NMI));
4662 }
4663 
4664 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4665 {
4666         if (is_guest_mode(vcpu)) {
4667                 if (to_vmx(vcpu)->nested.nested_run_pending)
4668                         return 0;
4669                 if (nested_exit_on_intr(vcpu)) {
4670                         nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4671                                           0, 0);
4672                         /*
4673                          * fall through to normal code, but now in L1, not L2
4674                          */
4675                 }
4676         }
4677 
4678         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
4679                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4680                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4681 }
4682 
4683 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4684 {
4685         int ret;
4686         struct kvm_userspace_memory_region tss_mem = {
4687                 .slot = TSS_PRIVATE_MEMSLOT,
4688                 .guest_phys_addr = addr,
4689                 .memory_size = PAGE_SIZE * 3,
4690                 .flags = 0,
4691         };
4692 
4693         ret = kvm_set_memory_region(kvm, &tss_mem);
4694         if (ret)
4695                 return ret;
4696         kvm->arch.tss_addr = addr;
4697         if (!init_rmode_tss(kvm))
4698                 return  -ENOMEM;
4699 
4700         return 0;
4701 }
4702 
4703 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4704 {
4705         switch (vec) {
4706         case BP_VECTOR:
4707                 /*
4708                  * Update instruction length as we may reinject the exception
4709                  * from user space while in guest debugging mode.
4710                  */
4711                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4712                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4713                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4714                         return false;
4715                 /* fall through */
4716         case DB_VECTOR:
4717                 if (vcpu->guest_debug &
4718                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4719                         return false;
4720                 /* fall through */
4721         case DE_VECTOR:
4722         case OF_VECTOR:
4723         case BR_VECTOR:
4724         case UD_VECTOR:
4725         case DF_VECTOR:
4726         case SS_VECTOR:
4727         case GP_VECTOR:
4728         case MF_VECTOR:
4729                 return true;
4730         break;
4731         }
4732         return false;
4733 }
4734 
4735 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4736                                   int vec, u32 err_code)
4737 {
4738         /*
4739          * An instruction with the address-size override prefix (0x67) can
4740          * cause a #GP or #SS fault with a zero error code in VM86 mode.
4741          */
4742         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4743                 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4744                         if (vcpu->arch.halt_request) {
4745                                 vcpu->arch.halt_request = 0;
4746                                 return kvm_emulate_halt(vcpu);
4747                         }
4748                         return 1;
4749                 }
4750                 return 0;
4751         }
4752 
4753         /*
4754          * Forward all other exceptions that are valid in real mode.
4755          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4756          *        the required debugging infrastructure rework.
4757          */
4758         kvm_queue_exception(vcpu, vec);
4759         return 1;
4760 }
4761 
4762 /*
4763  * Trigger machine check on the host. We assume all the MSRs are already set up
4764  * by the CPU and that we still run on the same CPU as the MCE occurred on.
4765  * We pass a fake environment to the machine check handler because we want
4766  * the guest to always be treated like user space, no matter what context
4767  * it used internally.
4768  */
4769 static void kvm_machine_check(void)
4770 {
4771 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
4772         struct pt_regs regs = {
4773                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
4774                 .flags = X86_EFLAGS_IF,
4775         };
4776 
4777         do_machine_check(&regs, 0);
4778 #endif
4779 }
4780 
4781 static int handle_machine_check(struct kvm_vcpu *vcpu)
4782 {
4783         /* already handled by vcpu_run */
4784         return 1;
4785 }
4786 
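/*
 * Rough dispatch order for intercepted exceptions: machine checks and NMIs
 * were already handled on the exit path, #NM re-activates the FPU, #UD is
 * emulated, #PF is forwarded to the MMU, exceptions taken in emulated real
 * mode are reflected back into the guest, and #DB/#BP or anything unexpected
 * is reported to user space.
 */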
4787 static int handle_exception(struct kvm_vcpu *vcpu)
4788 {
4789         struct vcpu_vmx *vmx = to_vmx(vcpu);
4790         struct kvm_run *kvm_run = vcpu->run;
4791         u32 intr_info, ex_no, error_code;
4792         unsigned long cr2, rip, dr6;
4793         u32 vect_info;
4794         enum emulation_result er;
4795 
4796         vect_info = vmx->idt_vectoring_info;
4797         intr_info = vmx->exit_intr_info;
4798 
4799         if (is_machine_check(intr_info))
4800                 return handle_machine_check(vcpu);
4801 
4802         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4803                 return 1;  /* already handled by vmx_vcpu_run() */
4804 
4805         if (is_no_device(intr_info)) {
4806                 vmx_fpu_activate(vcpu);
4807                 return 1;
4808         }
4809 
4810         if (is_invalid_opcode(intr_info)) {
4811                 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
4812                 if (er != EMULATE_DONE)
4813                         kvm_queue_exception(vcpu, UD_VECTOR);
4814                 return 1;
4815         }
4816 
4817         error_code = 0;
4818         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4819                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4820 
4821         /*
4822          * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
4823          * MMIO; it is better to report an internal error.
4824          * See the comments in vmx_handle_exit.
4825          */
4826         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4827             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4828                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4829                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4830                 vcpu->run->internal.ndata = 2;
4831                 vcpu->run->internal.data[0] = vect_info;
4832                 vcpu->run->internal.data[1] = intr_info;
4833                 return 0;
4834         }
4835 
4836         if (is_page_fault(intr_info)) {
4837                 /* EPT won't cause page fault directly */
4838                 BUG_ON(enable_ept);
4839                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
4840                 trace_kvm_page_fault(cr2, error_code);
4841 
4842                 if (kvm_event_needs_reinjection(vcpu))
4843                         kvm_mmu_unprotect_page_virt(vcpu, cr2);
4844                 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
4845         }
4846 
4847         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4848 
4849         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4850                 return handle_rmode_exception(vcpu, ex_no, error_code);
4851 
4852         switch (ex_no) {
4853         case DB_VECTOR:
4854                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
4855                 if (!(vcpu->guest_debug &
4856                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4857                         vcpu->arch.dr6 &= ~15;
4858                         vcpu->arch.dr6 |= dr6;
4859                         kvm_queue_exception(vcpu, DB_VECTOR);
4860                         return 1;
4861                 }
4862                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
4863                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4864                 /* fall through */
4865         case BP_VECTOR:
4866                 /*
4867                  * Update instruction length as we may reinject #BP from
4868                  * user space while in guest debugging mode. Reading it for
4869                  * #DB as well causes no harm, it is not used in that case.
4870                  */
4871                 vmx->vcpu.arch.event_exit_inst_len =
4872                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4873                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
4874                 rip = kvm_rip_read(vcpu);
4875                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
4876                 kvm_run->debug.arch.exception = ex_no;
4877                 break;
4878         default:
4879                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
4880                 kvm_run->ex.exception = ex_no;
4881                 kvm_run->ex.error_code = error_code;
4882                 break;
4883         }
4884         return 0;
4885 }
4886 
4887 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
4888 {
4889         ++vcpu->stat.irq_exits;
4890         return 1;
4891 }
4892 
4893 static int handle_triple_fault(struct kvm_vcpu *vcpu)
4894 {
4895         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4896         return 0;
4897 }
4898 
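/*
 * I/O instruction exits.  The exit-qualification bits used below (the SDM is
 * authoritative) are: bits 2:0 = access size minus one, bit 3 = direction
 * (1 = IN), bit 4 = string instruction, bits 31:16 = port number.  Only a
 * non-string OUT takes the fast path; everything else goes to the emulator.
 */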
4899 static int handle_io(struct kvm_vcpu *vcpu)
4900 {
4901         unsigned long exit_qualification;
4902         int size, in, string;
4903         unsigned port;
4904 
4905         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4906         string = (exit_qualification & 16) != 0;
4907         in = (exit_qualification & 8) != 0;
4908 
4909         ++vcpu->stat.io_exits;
4910 
4911         if (string || in)
4912                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4913 
4914         port = exit_qualification >> 16;
4915         size = (exit_qualification & 7) + 1;
4916         skip_emulated_instruction(vcpu);
4917 
4918         return kvm_fast_pio_out(vcpu, size, port);
4919 }
4920 
4921 static void
4922 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4923 {
4924         /*
4925          * Patch in the VMCALL instruction (opcode 0f 01 c1):
4926          */
4927         hypercall[0] = 0x0f;
4928         hypercall[1] = 0x01;
4929         hypercall[2] = 0xc1;
4930 }
4931 
4932 static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
4933 {
4934         unsigned long always_on = VMXON_CR0_ALWAYSON;
4935 
4936         if (nested_vmx_secondary_ctls_high &
4937                 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4938             nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4939                 always_on &= ~(X86_CR0_PE | X86_CR0_PG);
4940         return (val & always_on) == always_on;
4941 }
4942 
4943 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4944 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4945 {
4946         if (is_guest_mode(vcpu)) {
4947                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4948                 unsigned long orig_val = val;
4949 
4950                 /*
4951                  * We get here when L2 changed cr0 in a way that did not change
4952                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4953                  * but did change L0 shadowed bits. So we first calculate the
4954                  * effective cr0 value that L1 would like to write into the
4955                  * hardware. It consists of the L2-owned bits from the new
4956                  * value combined with the L1-owned bits from L1's guest_cr0.
4957                  */
4958                 val = (val & ~vmcs12->cr0_guest_host_mask) |
4959                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4960 
4961                 if (!nested_cr0_valid(vmcs12, val))
4962                         return 1;
4963 
4964                 if (kvm_set_cr0(vcpu, val))
4965                         return 1;
4966                 vmcs_writel(CR0_READ_SHADOW, orig_val);
4967                 return 0;
4968         } else {
4969                 if (to_vmx(vcpu)->nested.vmxon &&
4970                     ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4971                         return 1;
4972                 return kvm_set_cr0(vcpu, val);
4973         }
4974 }
4975 
4976 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4977 {
4978         if (is_guest_mode(vcpu)) {
4979                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4980                 unsigned long orig_val = val;
4981 
4982                 /* analogously to handle_set_cr0 */
4983                 val = (val & ~vmcs12->cr4_guest_host_mask) |
4984                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4985                 if (kvm_set_cr4(vcpu, val))
4986                         return 1;
4987                 vmcs_writel(CR4_READ_SHADOW, orig_val);
4988                 return 0;
4989         } else
4990                 return kvm_set_cr4(vcpu, val);
4991 }
4992 
4993 /* called to set cr0 as appropriate for a clts instruction exit. */
4994 static void handle_clts(struct kvm_vcpu *vcpu)
4995 {
4996         if (is_guest_mode(vcpu)) {
4997                 /*
4998                  * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4999                  * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
5000                  * just pretend it's off (also in arch.cr0 for fpu_activate).
5001                  */
5002                 vmcs_writel(CR0_READ_SHADOW,
5003                         vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
5004                 vcpu->arch.cr0 &= ~X86_CR0_TS;
5005         } else
5006                 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5007 }
5008 
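/*
 * Control-register access exits.  Per the SDM, the exit qualification
 * encodes: bits 3:0 = control register number, bits 5:4 = access type
 * (0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW), bits 11:8 = the
 * general-purpose register involved, and bits 31:16 = LMSW source data.
 */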
5009 static int handle_cr(struct kvm_vcpu *vcpu)
5010 {
5011         unsigned long exit_qualification, val;
5012         int cr;
5013         int reg;
5014         int err;
5015 
5016         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5017         cr = exit_qualification & 15;
5018         reg = (exit_qualification >> 8) & 15;
5019         switch ((exit_qualification >> 4) & 3) {
5020         case 0: /* mov to cr */
5021                 val = kvm_register_read(vcpu, reg);
5022                 trace_kvm_cr_write(cr, val);
5023                 switch (cr) {
5024                 case 0:
5025                         err = handle_set_cr0(vcpu, val);
5026                         kvm_complete_insn_gp(vcpu, err);
5027                         return 1;
5028                 case 3:
5029                         err = kvm_set_cr3(vcpu, val);
5030                         kvm_complete_insn_gp(vcpu, err);
5031                         return 1;
5032                 case 4:
5033                         err = handle_set_cr4(vcpu, val);
5034                         kvm_complete_insn_gp(vcpu, err);
5035                         return 1;
5036                 case 8: {
5037                                 u8 cr8_prev = kvm_get_cr8(vcpu);
5038                                 u8 cr8 = kvm_register_read(vcpu, reg);
5039                                 err = kvm_set_cr8(vcpu, cr8);
5040                                 kvm_complete_insn_gp(vcpu, err);
5041                                 if (irqchip_in_kernel(vcpu->kvm))
5042                                         return 1;
5043                                 if (cr8_prev <= cr8)
5044                                         return 1;
5045                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5046                                 return 0;
5047                         }
5048                 }
5049                 break;
5050         case 2: /* clts */
5051                 handle_clts(vcpu);
5052                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5053                 skip_emulated_instruction(vcpu);
5054                 vmx_fpu_activate(vcpu);
5055                 return 1;
5056         case 1: /*mov from cr*/
5057                 switch (cr) {
5058                 case 3:
5059                         val = kvm_read_cr3(vcpu);
5060                         kvm_register_write(vcpu, reg, val);
5061                         trace_kvm_cr_read(cr, val);
5062                         skip_emulated_instruction(vcpu);
5063                         return 1;
5064                 case 8:
5065                         val = kvm_get_cr8(vcpu);
5066                         kvm_register_write(vcpu, reg, val);
5067                         trace_kvm_cr_read(cr, val);
5068                         skip_emulated_instruction(vcpu);
5069                         return 1;
5070                 }
5071                 break;
5072         case 3: /* lmsw */
5073                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5074                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5075                 kvm_lmsw(vcpu, val);
5076 
5077                 skip_emulated_instruction(vcpu);
5078                 return 1;
5079         default:
5080                 break;
5081         }
5082         vcpu->run->exit_reason = 0;
5083         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5084                (int)(exit_qualification >> 4) & 3, cr);
5085         return 0;
5086 }
5087 
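/*
 * Debug-register access exits.  If DR7.GD is set the access is converted
 * into a #DB (or a KVM_EXIT_DEBUG exit when the host owns the hardware
 * breakpoints).  Otherwise the exit qualification supplies the debug
 * register number, the direction and the GP register, and the access is
 * performed against the vcpu's shadow debug state.
 */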
5088 static int handle_dr(struct kvm_vcpu *vcpu)
5089 {
5090         unsigned long exit_qualification;
5091         int dr, reg;
5092 
5093         /* Do not handle if CPL > 0; the queued #GP will trigger on re-entry */
5094         if (!kvm_require_cpl(vcpu, 0))
5095                 return 1;
5096         dr = vmcs_readl(GUEST_DR7);
5097         if (dr & DR7_GD) {
5098                 /*
5099                  * As the vm-exit takes precedence over the debug trap, we
5100                  * need to emulate the latter, either for the host or the
5101                  * guest debugging itself.
5102                  */
5103                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5104                         vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
5105                         vcpu->run->debug.arch.dr7 = dr;
5106                         vcpu->run->debug.arch.pc =
5107                                 vmcs_readl(GUEST_CS_BASE) +
5108                                 vmcs_readl(GUEST_RIP);
5109                         vcpu->run->debug.arch.exception = DB_VECTOR;
5110                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5111                         return 0;
5112                 } else {
5113                         vcpu->arch.dr7 &= ~DR7_GD;
5114                         vcpu->arch.dr6 |= DR6_BD;
5115                         vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
5116                         kvm_queue_exception(vcpu, DB_VECTOR);
5117                         return 1;
5118                 }
5119         }
5120 
5121         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5122         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5123         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5124         if (exit_qualification & TYPE_MOV_FROM_DR) {
5125                 unsigned long val;
5126 
5127                 if (kvm_get_dr(vcpu, dr, &val))
5128                         return 1;
5129                 kvm_register_write(vcpu, reg, val);
5130         } else
5131                 if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
5132                         return 1;
5133 
5134         skip_emulated_instruction(vcpu);
5135         return 1;
5136 }
5137 
5138 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5139 {
5140         return vcpu->arch.dr6;
5141 }
5142 
5143 static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5144 {
5145 }
5146 
5147 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5148 {
5149         vmcs_writel(GUEST_DR7, val);
5150 }
5151 
5152 static int handle_cpuid(struct kvm_vcpu *vcpu)
5153 {
5154         kvm_emulate_cpuid(vcpu);
5155         return 1;
5156 }
5157 
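/*
 * RDMSR/WRMSR exits.  Both handlers mirror the instructions' register usage:
 * the MSR index comes from ECX and the 64-bit value is split across EDX:EAX.
 * A rejected access injects #GP(0) instead of completing the instruction.
 */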
5158 static int handle_rdmsr(struct kvm_vcpu *vcpu)
5159 {
5160         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5161         u64 data;
5162 
5163         if (vmx_get_msr(vcpu, ecx, &data)) {
5164                 trace_kvm_msr_read_ex(ecx);
5165                 kvm_inject_gp(vcpu, 0);
5166                 return 1;
5167         }
5168 
5169         trace_kvm_msr_read(ecx, data);
5170 
5171         /* FIXME: handling of bits 32:63 of rax, rdx */
5172         vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
5173         vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
5174         skip_emulated_instruction(vcpu);
5175         return 1;
5176 }
5177 
5178 static int handle_wrmsr(struct kvm_vcpu *vcpu)
5179 {
5180         struct msr_data msr;
5181         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5182         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
5183                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
5184 
5185         msr.data = data;
5186         msr.index = ecx;
5187         msr.host_initiated = false;
5188         if (kvm_set_msr(vcpu, &msr) != 0) {
5189                 trace_kvm_msr_write_ex(ecx, data);
5190                 kvm_inject_gp(vcpu, 0);
5191                 return 1;
5192         }
5193 
5194         trace_kvm_msr_write(ecx, data);
5195         skip_emulated_instruction(vcpu);
5196         return 1;
5197 }
5198 
5199 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5200 {
5201         kvm_make_request(KVM_REQ_EVENT, vcpu);
5202         return 1;
5203 }
5204 
5205 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5206 {
5207         u32 cpu_based_vm_exec_control;
5208 
5209         /* clear pending irq */
5210         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5211         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
5212         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5213 
5214         kvm_make_request(KVM_REQ_EVENT, vcpu);
5215 
5216         ++vcpu->stat.irq_window_exits;
5217 
5218         /*
5219          * If user space is waiting to inject interrupts, exit as soon as
5220          * possible.
5221          */
5222         if (!irqchip_in_kernel(vcpu->kvm) &&
5223             vcpu->run->request_interrupt_window &&
5224             !kvm_cpu_has_interrupt(vcpu)) {
5225                 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
5226                 return 0;
5227         }
5228         return 1;
5229 }
5230 
5231 static int handle_halt(struct kvm_vcpu *vcpu)
5232 {
5233         skip_emulated_instruction(vcpu);
5234         return kvm_emulate_halt(vcpu);
5235 }
5236 
5237 static int handle_vmcall(struct kvm_vcpu *vcpu)
5238 {
5239         skip_emulated_instruction(vcpu);
5240         kvm_emulate_hypercall(vcpu);
5241         return 1;
5242 }
5243 
5244 static int handle_invd(struct kvm_vcpu *vcpu)
5245 {
5246         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5247 }
5248 
5249 static int handle_invlpg(struct kvm_vcpu *vcpu)
5250 {
5251         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5252 
5253         kvm_mmu_invlpg(vcpu, exit_qualification);
5254         skip_emulated_instruction(vcpu);
5255         return 1;
5256 }
5257 
5258 static int handle_rdpmc(struct kvm_vcpu *vcpu)
5259 {
5260         int err;
5261 
5262         err = kvm_rdpmc(vcpu);
5263         kvm_complete_insn_gp(vcpu, err);
5264 
5265         return 1;
5266 }
5267 
5268 static int handle_wbinvd(struct kvm_vcpu *vcpu)
5269 {
5270         skip_emulated_instruction(vcpu);
5271         kvm_emulate_wbinvd(vcpu);
5272         return 1;
5273 }
5274 
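/*
 * XSETBV exit: the XCR index comes from ECX and the new value from EDX:EAX.
 * The instruction is only skipped when kvm_set_xcr() accepts the value, so
 * a rejected value faults with RIP still pointing at the XSETBV itself.
 */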
5275 static int handle_xsetbv(struct kvm_vcpu *vcpu)
5276 {
5277         u64 new_bv = kvm_read_edx_eax(vcpu);
5278         u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5279 
5280         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5281                 skip_emulated_instruction(vcpu);
5282         return 1;
5283 }
5284 
5285 static int handle_apic_access(struct kvm_vcpu *vcpu)
5286 {
5287         if (likely(fasteoi)) {
5288                 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5289                 int access_type, offset;
5290 
5291                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5292                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5293                 /*
5294                  * A sane guest uses MOV to write EOI, and the written value
5295                  * does not matter, so short-circuit that case here and avoid
5296                  * heavyweight instruction emulation.
5297                  */
5298                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5299                     (offset == APIC_EOI)) {
5300                         kvm_lapic_set_eoi(vcpu);
5301                         skip_emulated_instruction(vcpu);
5302                         return 1;
5303                 }
5304         }
5305         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5306 }
5307 
5308 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5309 {
5310         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5311         int vector = exit_qualification & 0xff;
5312 
5313         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5314         kvm_apic_set_eoi_accelerated(vcpu, vector);
5315         return 1;
5316 }
5317 
5318 static int handle_apic_write(struct kvm_vcpu *vcpu)
5319 {
5320         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5321         u32 offset = exit_qualification & 0xfff;
5322 
5323         /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5324         kvm_apic_write_nodecode(vcpu, offset);
5325         return 1;
5326 }
5327 
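/*
 * Task-switch exits.  Per the SDM, bits 15:0 of the exit qualification hold
 * the new TSS selector and bits 31:30 the switch source (CALL, IRET, JMP or
 * an IDT task gate).  An event that was being delivered through a task gate
 * is dequeued first, and its error code (if any) is handed to
 * kvm_task_switch() so the emulated switch can account for it.
 */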
5328 static int handle_task_switch(struct kvm_vcpu *vcpu)
5329 {
5330         struct vcpu_vmx *vmx = to_vmx(vcpu);
5331         unsigned long exit_qualification;
5332         bool has_error_code = false;
5333         u32 error_code = 0;
5334         u16 tss_selector;
5335         int reason, type, idt_v, idt_index;
5336 
5337         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5338         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5339         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5340 
5341         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5342 
5343         reason = (u32)exit_qualification >> 30;
5344         if (reason == TASK_SWITCH_GATE && idt_v) {
5345                 switch (type) {
5346                 case INTR_TYPE_NMI_INTR:
5347                         vcpu->arch.nmi_injected = false;
5348                         vmx_set_nmi_mask(vcpu, true);
5349                         break;
5350                 case INTR_TYPE_EXT_INTR:
5351                 case INTR_TYPE_SOFT_INTR:
5352                         kvm_clear_interrupt_queue(vcpu);
5353                         break;
5354                 case INTR_TYPE_HARD_EXCEPTION:
5355                         if (vmx->idt_vectoring_info &
5356                             VECTORING_INFO_DELIVER_CODE_MASK) {
5357                                 has_error_code = true;
5358                                 error_code =
5359                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5360                         }
5361                         /* fall through */
5362                 case INTR_TYPE_SOFT_EXCEPTION:
5363                         kvm_clear_exception_queue(vcpu);
5364                         break;
5365                 default:
5366                         break;
5367                 }
5368         }
5369         tss_selector = exit_qualification;
5370 
5371         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5372                        type != INTR_TYPE_EXT_INTR &&
5373                        type != INTR_TYPE_NMI_INTR))
5374                 skip_emulated_instruction(vcpu);
5375 
5376         if (kvm_task_switch(vcpu, tss_selector,
5377                             type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
5378                             has_error_code, error_code) == EMULATE_FAIL) {
5379                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5380                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5381                 vcpu->run->internal.ndata = 0;
5382                 return 0;
5383         }
5384 
5385         /* clear all local breakpoint enable flags */
5386         vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
5387 
5388         /*
5389          * TODO: What about debug traps on tss switch?
5390          *       Are we supposed to inject them and update dr6?
5391          */
5392 
5393         return 1;
5394 }
5395 
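/*
 * EPT violation exits.  The exit qualification is converted into a page
 * fault error code for the MMU: bit 1 of the qualification indicates a
 * write access, bit 2 an instruction fetch, and bit 3 (whether the
 * guest-physical address was readable at all) serves as the "present" bit.
 */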
5396 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5397 {
5398         unsigned long exit_qualification;
5399         gpa_t gpa;
5400         u32 error_code;
5401         int gla_validity;
5402 
5403         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5404 
5405         gla_validity = (exit_qualification >> 7) & 0x3;
5406         if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
5407                 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
5408                 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
5409                         (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
5410                         vmcs_readl(GUEST_LINEAR_ADDRESS));
5411                 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
5412                         (long unsigned int)exit_qualification);
5413                 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5414                 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
5415                 return 0;
5416         }
5417 
5418         /*
5419          * If the EPT violation happened while executing IRET from an NMI,
5420          * the "blocked by NMI" bit has to be set before the next VM entry.
5421          * There are errata that may cause this bit to not be set:
5422          * AAK134, BY25.
5423          */
5424         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5425                         cpu_has_virtual_nmis() &&
5426                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5427                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5428 
5429         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5430         trace_kvm_page_fault(gpa, exit_qualification);
5431 
5432         /* Is it a write fault? */
5433         error_code = exit_qualification & (1U << 1);
5434         /* Is it a fetch fault? */
5435         error_code |= (exit_qualification & (1U << 2)) << 2;
5436         /* Is the EPT paging-structure entry present? */
5437         error_code |= (exit_qualification >> 3) & 0x1;
5438 
5439         vcpu->arch.exit_qualification = exit_qualification;
5440 
5441         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5442 }
5443 
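/*
 * Build the mask of bits that must be zero in an EPT entry at the given
 * level: everything above the CPU's physical-address width up to bit 51,
 * plus the level-specific reserved ranges.  This roughly mirrors the checks
 * the hardware performs before reporting an EPT misconfiguration.
 */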
5444 static u64 ept_rsvd_mask(u64 spte, int level)
5445 {
5446         int i;
5447         u64 mask = 0;
5448 
5449         for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
5450                 mask |= (1ULL << i);
5451 
5452         if (level > 2)
5453                 /* bits 7:3 reserved */
5454                 mask |= 0xf8;
5455         else if (level == 2) {
5456                 if (spte & (1ULL << 7))
5457                         /* 2MB page, bits 20:12 reserved */
5458                         mask |= 0x1ff000;
5459                 else
5460                         /* bits 6:3 reserved */
5461                         mask |= 0x78;
5462         }
5463 
5464         return mask;
5465 }
5466 
5467 static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
5468                                        int level)
5469 {
5470         printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
5471 
5472         /* 010b (write-only) */
5473         WARN_ON((spte & 0x7) == 0x2);
5474 
5475         /* 110b (write/execute) */
5476         WARN_ON((spte & 0x7) == 0x6);
5477 
5478         /* 100b (execute-only) and value not supported by logical processor */
5479         if (!cpu_has_vmx_ept_execute_only())
5480                 WARN_ON((spte & 0x7) == 0x4);
5481 
5482         /* not 000b */
5483         if ((spte & 0x7)) {
5484                 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
5485 
5486                 if (rsvd_bits != 0) {
5487                         printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
5488                                          __func__, rsvd_bits);
5489                         WARN_ON(1);
5490                 }
5491 
5492                 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
5493                         u64 ept_mem_type = (spte & 0x38) >> 3;
5494 
5495                         if (ept_mem_type == 2 || ept_mem_type == 3 ||
5496                             ept_mem_type == 7) {
5497                                 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
5498                                                 __func__, ept_mem_type);
5499                                 WARN_ON(1);
5500                         }
5501                 }
5502         }
5503 }
5504 
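/*
 * EPT misconfiguration exits.  KVM deliberately installs misconfigured
 * sptes for MMIO (see ept_set_mmio_spte_mask()) so that MMIO accesses trap
 * here and can be emulated cheaply.  Only when the faulting spte does not
 * look like such an MMIO marker is the misconfiguration treated as a real
 * error and dumped for debugging.
 */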
5505 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5506 {
5507         u64 sptes[4];
5508         int nr_sptes, i, ret;
5509         gpa_t gpa;
5510 
5511         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5512 
5513         ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5514         if (likely(ret == RET_MMIO_PF_EMULATE))
5515                 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5516                                               EMULATE_DONE;
5517 
5518         if (unlikely(ret == RET_MMIO_PF_INVALID))
5519                 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
5520 
5521         if (unlikely(ret == RET_MMIO_PF_RETRY))
5522                 return 1;
5523 
5524         /* It is a real EPT misconfig */
5525         printk(KERN_ERR "EPT: Misconfiguration.\n");
5526         printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
5527 
5528         nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
5529 
5530         for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
5531                 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
5532 
5533         vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5534         vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
5535 
5536         return 0;
5537 }
5538 
5539 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5540 {
5541         u32 cpu_based_vm_exec_control;
5542 
5543         /* clear pending NMI */
5544         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5545         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
5546         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5547         ++vcpu->stat.nmi_window_exits;
5548         kvm_make_request(KVM_REQ_EVENT, vcpu);
5549 
5550         return 1;
5551 }
5552 
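/*
 * When the guest is in a state VMX cannot enter natively (and unrestricted
 * guest mode is unavailable), instructions are emulated one at a time until
 * the state becomes valid again.  The loop below bounds the work done per
 * exit and breaks out early for pending events, user-space exits, emulation
 * failures, a guest HLT, or pending signals.
 */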
5553 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5554 {
5555         struct vcpu_vmx *vmx = to_vmx(vcpu);
5556         enum emulation_result err = EMULATE_DONE;
5557         int ret = 1;
5558         u32 cpu_exec_ctrl;
5559         bool intr_window_requested;
5560         unsigned count = 130;
5561 
5562         cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5563         intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
5564 
5565         while (!guest_state_valid(vcpu) && count-- != 0) {
5566                 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
5567                         return handle_interrupt_window(&vmx->vcpu);
5568 
5569                 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5570                         return 1;
5571 
5572                 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5573 
5574                 if (err == EMULATE_USER_EXIT) {
5575                         ++vcpu->stat.mmio_exits;
5576                         ret = 0;
5577                         goto out;
5578                 }
5579 
5580                 if (err != EMULATE_DONE) {
5581                         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5582                         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5583                         vcpu->run->internal.ndata = 0;
5584                         return 0;
5585                 }
5586 
5587                 if (vcpu->arch.halt_request) {
5588                         vcpu->arch.halt_request = 0;
5589                         ret = kvm_emulate_halt(vcpu);
5590                         goto out;
5591                 }
5592 
5593                 if (signal_pending(current))
5594                         goto out;
5595                 if (need_resched())
5596                         schedule();
5597         }
5598 
5599         vmx->emulation_required = emulation_required(vcpu);
5600 out:
5601         return ret;
5602 }
5603 
5604 /*
5605  * Indicates a vcpu busy-waiting on a spinlock. We do not enable plain PAUSE
5606  * exiting, so we only get here on CPUs with PAUSE-loop exiting.
5607  */
5608 static int handle_pause(struct kvm_vcpu *vcpu)
5609 {
5610         skip_emulated_instruction(vcpu);
5611         kvm_vcpu_on_spin(vcpu);
5612 
5613         return 1;
5614 }
5615 
5616 static int handle_invalid_op(struct kvm_vcpu *vcpu)
5617 {
5618         kvm_queue_exception(vcpu, UD_VECTOR);
5619         return 1;
5620 }
5621 
5622 /*
5623  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
5624  * We could reuse a single VMCS for all the L2 guests, but we also want the
5625  * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
5626  * allows keeping them loaded on the processor, and in the future will allow
5627  * optimizations where prepare_vmcs02 doesn't need to set all the fields on
5628  * every entry if they never change.
5629  * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
5630  * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
5631  *
5632  * The following functions allocate and free a vmcs02 in this pool.
5633  */
5634 
5635 /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
5636 static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
5637 {
5638         struct vmcs02_list *item;
5639         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
5640                 if (item->vmptr == vmx->nested.current_vmptr) {
5641                         list_move(&item->list, &vmx->nested.vmcs02_pool);
5642                         return &item->vmcs02;
5643                 }
5644 
5645         if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
5646                 /* Recycle the least recently used VMCS. */
5647                 item = list_entry(vmx->nested.vmcs02_pool.prev,
5648                         struct vmcs02_list, list);
5649                 item->vmptr = vmx->nested.current_vmptr;
5650                 list_move(&item->list, &vmx->nested.vmcs02_pool);
5651                 return &item->vmcs02;
5652         }
5653 
5654         /* Create a new VMCS */
5655         item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5656         if (!item)
5657                 return NULL;
5658         item->vmcs02.vmcs = alloc_vmcs();
5659         if (!item->vmcs02.vmcs) {
5660                 kfree(item);
5661                 return NULL;
5662         }
5663         loaded_vmcs_init(&item->vmcs02);
5664         item->vmptr = vmx->nested.current_vmptr;
5665         list_add(&(item->list), &(vmx->nested.vmcs02_pool));
5666         vmx->nested.vmcs02_num++;
5667         return &item->vmcs02;
5668 }
5669 
5670 /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
5671 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
5672 {
5673         struct vmcs02_list *item;
5674         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
5675                 if (item->vmptr == vmptr) {
5676                         free_loaded_vmcs(&item->vmcs02);
5677                         list_del(&item->list);
5678                         kfree(item);
5679                         vmx->nested.vmcs02_num--;
5680                         return;
5681                 }
5682 }
5683 
5684 /*
5685  * Free all VMCSs saved for this vcpu, except the one pointed by
5686  * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
5687  * currently used, if running L2), and vmcs01 when running L2.
5688  */
5689 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5690 {
5691         struct vmcs02_list *item, *n;
5692         list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
5693                 if (vmx->loaded_vmcs != &item->vmcs02)
5694                         free_loaded_vmcs(&item->vmcs02);
5695                 list_del(&item->list);
5696                 kfree(item);
5697         }
5698         vmx->nested.vmcs02_num = 0;
5699 
5700         if (vmx->loaded_vmcs != &vmx->vmcs01)
5701                 free_loaded_vmcs(&vmx->vmcs01);
5702 }
5703 
5704 /*
5705  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5706  * set the success or error code of an emulated VMX instruction, as specified
5707  * by Vol 2B, VMX Instruction Reference, "Conventions".
5708  */
5709 static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5710 {
5711         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5712                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5713                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5714 }
5715 
5716 static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5717 {
5718         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5719                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5720                             X86_EFLAGS_SF | X86_EFLAGS_OF))
5721                         | X86_EFLAGS_CF);
5722 }
5723 
5724 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5725                                         u32 vm_instruction_error)
5726 {
5727         if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5728                 /*
5729                  * failValid writes the error number to the current VMCS, which
5730                  * can't be done if there isn't a current VMCS.
5731                  */
5732                 nested_vmx_failInvalid(vcpu);
5733                 return;
5734         }
5735         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5736                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5737                             X86_EFLAGS_SF | X86_EFLAGS_OF))
5738                         | X86_EFLAGS_ZF);
5739         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5740         /*
5741          * We don't need to force a shadow sync because
5742          * VM_INSTRUCTION_ERROR is not shadowed
5743          */
5744 }
5745 
5746 /*
5747  * Emulate the VMXON instruction.
5748  * Currently, we just remember that VMX is active, and do not save or even
5749  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
5750  * do not currently need to store anything in that guest-allocated memory
5751  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
5752  * argument is different from the VMXON pointer (which the spec says they do).
5753  */
5754 static int handle_vmon(struct kvm_vcpu *vcpu)
5755 {
5756         struct kvm_segment cs;
5757         struct vcpu_vmx *vmx = to_vmx(vcpu);
5758         struct vmcs *shadow_vmcs;
5759         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5760                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5761 
5762         /* The Intel VMX Instruction Reference lists a bunch of bits that
5763          * are prerequisite to running VMXON, most notably cr4.VMXE must be
5764          * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
5765          * Otherwise, we should fail with #UD. We test these now:
5766          */
5767         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
5768             !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
5769             (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
5770                 kvm_queue_exception(vcpu, UD_VECTOR);
5771                 return 1;
5772         }
5773 
5774         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5775         if (is_long_mode(vcpu) && !cs.l) {
5776                 kvm_queue_exception(vcpu, UD_VECTOR);
5777                 return 1;
5778         }
5779 
5780         if (vmx_get_cpl(vcpu)) {
5781                 kvm_inject_gp(vcpu, 0);
5782                 return 1;
5783         }
5784         if (vmx->nested.vmxon) {
5785                 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5786                 skip_emulated_instruction(vcpu);
5787                 return 1;
5788         }
5789 
5790         if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5791                         != VMXON_NEEDED_FEATURES) {
5792                 kvm_inject_gp(vcpu, 0);
5793                 return 1;
5794         }
5795 
5796         if (enable_shadow_vmcs) {
5797                 shadow_vmcs = alloc_vmcs();
5798                 if (!shadow_vmcs)
5799                         return -ENOMEM;
5800                 /* mark vmcs as shadow */
5801                 shadow_vmcs->revision_id |= (1u << 31);
5802                 /* init shadow vmcs */
5803                 vmcs_clear(shadow_vmcs);
5804                 vmx->nested.current_shadow_vmcs = shadow_vmcs;
5805         }
5806 
5807         INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
5808         vmx->nested.vmcs02_num = 0;
5809 
5810         vmx->nested.vmxon = true;
5811 
5812         skip_emulated_instruction(vcpu);
5813         nested_vmx_succeed(vcpu);
5814         return 1;
5815 }
5816 
5817 /*
5818  * Intel's VMX Instruction Reference specifies a common set of prerequisites
5819  * for running VMX instructions (except VMXON, whose prerequisites are
5820  * slightly different). It also specifies what exception to inject otherwise.
5821  */
5822 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
5823 {
5824         struct kvm_segment cs;
5825         struct vcpu_vmx *vmx = to_vmx(vcpu);
5826 
5827         if (!vmx->nested.vmxon) {
5828                 kvm_queue_exception(vcpu, UD_VECTOR);
5829                 return 0;
5830         }
5831 
5832         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5833         if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
5834             (is_long_mode(vcpu) && !cs.l)) {
5835                 kvm_queue_exception(vcpu, UD_VECTOR);
5836                 return 0;
5837         }
5838 
5839         if (vmx_get_cpl(vcpu)) {
5840                 kvm_inject_gp(vcpu, 0);
5841                 return 0;
5842         }
5843 
5844         return 1;
5845 }
5846 
5847 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
5848 {
5849         u32 exec_control;
5850         if (enable_shadow_vmcs) {
5851                 if (vmx->nested.current_vmcs12 != NULL) {
5852                         /* copy to memory all shadowed fields in case
5853                            they were modified */
5854                         copy_shadow_to_vmcs12(vmx);
5855                         vmx->nested.sync_shadow_vmcs = false;
5856                         exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5857                         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5858                         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
5859                         vmcs_write64(VMCS_LINK_POINTER, -1ull);
5860                 }
5861         }
5862         kunmap(vmx->nested.current_vmcs12_page);
5863         nested_release_page(vmx->nested.current_vmcs12_page);
5864 }
5865 
5866 /*
5867  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
5868  * just stops using VMX.
5869  */
5870 static void free_nested(struct vcpu_vmx *vmx)
5871 {
5872         if (!vmx->nested.vmxon)
5873                 return;
5874         vmx->nested.vmxon = false;
5875         if (vmx->nested.current_vmptr != -1ull) {
5876                 nested_release_vmcs12(vmx);
5877                 vmx->nested.current_vmptr = -1ull;
5878                 vmx->nested.current_vmcs12 = NULL;
5879         }
5880         if (enable_shadow_vmcs)
5881                 free_vmcs(vmx->nested.current_shadow_vmcs);
5882         /* Unpin physical memory we referred to in current vmcs02 */
5883         if (vmx->nested.apic_access_page) {
5884                 nested_release_page(vmx->nested.apic_access_page);
5885                 vmx->nested.apic_access_page = NULL;
5886         }
5887 
5888         nested_free_all_saved_vmcss(vmx);
5889 }
5890 
5891 /* Emulate the VMXOFF instruction */
5892 static int handle_vmoff(struct kvm_vcpu *vcpu)
5893 {
5894         if (!nested_vmx_check_permission(vcpu))
5895                 return 1;
5896         free_nested(to_vmx(vcpu));
5897         skip_emulated_instruction(vcpu);
5898         nested_vmx_succeed(vcpu);
5899         return 1;
5900 }
5901 
5902 /*
5903  * Decode the memory-address operand of a vmx instruction, as recorded on an
5904  * exit caused by such an instruction (run by a guest hypervisor).
5905  * On success, returns 0. When the operand is invalid, returns 1 and throws
5906  * #UD or #GP.
5907  */
5908 static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5909                                  unsigned long exit_qualification,
5910                                  u32 vmx_instruction_info, gva_t *ret)
5911 {
5912         /*
5913          * According to Vol. 3B, "Information for VM Exits Due to Instruction
5914          * Execution", on an exit, vmx_instruction_info holds most of the
5915          * addressing components of the operand. Only the displacement part
5916          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5917          * For how an actual address is calculated from all these components,
5918          * refer to Vol. 1, "Operand Addressing".
5919          */
5920         int  scaling = vmx_instruction_info & 3;
5921         int  addr_size = (vmx_instruction_info >> 7) & 7;
5922         bool is_reg = vmx_instruction_info & (1u << 10);
5923         int  seg_reg = (vmx_instruction_info >> 15) & 7;
5924         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
5925         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5926         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
5927         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
5928 
5929         if (is_reg) {
5930                 kvm_queue_exception(vcpu, UD_VECTOR);
5931                 return 1;
5932         }
5933 
5934         /* Addr = segment_base + offset */
5935         /* offset = base + [index * scale] + displacement */
5936         *ret = vmx_get_segment_base(vcpu, seg_reg);
5937         if (base_is_valid)
5938                 *ret += kvm_register_read(vcpu, base_reg);
5939         if (index_is_valid)
5940                 *ret += kvm_register_read(vcpu, index_reg) << scaling;
5941         *ret += exit_qualification; /* holds the displacement */
5942 
5943         if (addr_size == 1) /* 32 bit */
5944                 *ret &= 0xffffffff;
5945 
5946         /*
5947          * TODO: throw #GP (and return 1) in various cases that the VM*
5948          * instructions require it - e.g., offset beyond segment limit,
5949          * unusable or unreadable/unwritable segment, non-canonical 64-bit
5950          * address, and so on. Currently these are not checked.
5951          */
5952         return 0;
5953 }
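
/*
 * Illustrative sketch, not part of vmx.c: a worked example of the decoding
 * above for a made-up vmx_instruction_info of 0x58103 -- memory operand,
 * 64-bit address size, scale-by-8, segment DS, index RCX, base RAX.  The
 * helper name and all register/segment values are invented for illustration.
 */
static inline u64 demo_vmx_operand_gva(void)
{
        u64 ds_base = 0;                        /* flat segment (made up)    */
        u64 rax     = 0x7f0000001000ull;        /* base register (made up)   */
        u64 rcx     = 4;                        /* index register (made up)  */
        u64 disp    = 0x10;                     /* displacement, i.e. the
                                                 * exit qualification        */
        int scaling = 0x58103 & 3;              /* == 3, index scaled by 8   */

        /* Addr = segment_base + base + (index << scaling) + displacement */
        return ds_base + rax + (rcx << scaling) + disp; /* 0x7f0000001030 */
}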
5954 
5955 /* Emulate the VMCLEAR instruction */
5956 static int handle_vmclear(struct kvm_vcpu *vcpu)
5957 {
5958         struct vcpu_vmx *vmx = to_vmx(vcpu);
5959         gva_t gva;
5960         gpa_t vmptr;
5961         struct vmcs12 *vmcs12;
5962         struct page *page;
5963         struct x86_exception e;
5964 
5965         if (!nested_vmx_check_permission(vcpu))
5966                 return 1;
5967 
5968         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5969                         vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5970                 return 1;
5971 
5972         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5973                                 sizeof(vmptr), &e)) {
5974                 kvm_inject_page_fault(vcpu, &e);
5975                 return 1;
5976         }
5977 
5978         if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5979                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5980                 skip_emulated_instruction(vcpu);
5981                 return 1;
5982         }
5983 
5984         if (vmptr == vmx->nested.current_vmptr) {
5985                 nested_release_vmcs12(vmx);
5986                 vmx->nested.current_vmptr = -1ull;
5987                 vmx->nested.current_vmcs12 = NULL;
5988         }
5989 
5990         page = nested_get_page(vcpu, vmptr);
5991         if (page == NULL) {
5992                 /*
5993                  * For accurate processor emulation, VMCLEAR beyond available
5994                  * physical memory should do nothing at all. However, it is
5995                  * possible that a nested vmx bug, not a guest hypervisor bug,
5996                  * resulted in this case, so let's shut down before doing any
5997                  * more damage:
5998                  */
5999                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6000                 return 1;
6001         }
6002         vmcs12 = kmap(page);
6003         vmcs12->launch_state = 0;
6004         kunmap(page);
6005         nested_release_page(page);
6006 
6007         nested_free_vmcs02(vmx, vmptr);
6008 
6009         skip_emulated_instruction(vcpu);
6010         nested_vmx_succeed(vcpu);
6011         return 1;
6012 }
6013 
6014 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
6015 
6016 /* Emulate the VMLAUNCH instruction */
6017 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
6018 {
6019         return nested_vmx_run(vcpu, true);
6020 }
6021 
6022 /* Emulate the VMRESUME instruction */
6023 static int handle_vmresume(struct kvm_vcpu *vcpu)
6024 {
6025 
6026         return nested_vmx_run(vcpu, false);
6027 }
6028 
6029 enum vmcs_field_type {
6030         VMCS_FIELD_TYPE_U16 = 0,
6031         VMCS_FIELD_TYPE_U64 = 1,
6032         VMCS_FIELD_TYPE_U32 = 2,
6033         VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
6034 };
6035 
6036 static inline int vmcs_field_type(unsigned long field)
6037 {
6038         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
6039                 return VMCS_FIELD_TYPE_U32;
6040         return (field >> 13) & 0x3;
6041 }
6042 
6043 static inline int vmcs_field_readonly(unsigned long field)
6044 {
6045         return (((field >> 10) & 0x3) == 1);
6046 }
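
/*
 * Illustrative sketch, not part of vmx.c: the two helpers above applied to
 * real field encodings (the helper name below is made up).  GUEST_RIP
 * (0x681e) has bits 14:13 == 3, so it is natural-width; VM_EXIT_REASON
 * (0x4402) has bits 14:13 == 2 (u32) and bits 11:10 == 1, i.e. read-only.
 */
static inline bool demo_vmcs_field_encoding_ok(void)
{
        return vmcs_field_type(GUEST_RIP) == VMCS_FIELD_TYPE_NATURAL_WIDTH &&
               vmcs_field_type(VM_EXIT_REASON) == VMCS_FIELD_TYPE_U32 &&
               vmcs_field_readonly(VM_EXIT_REASON);
}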
6047 
6048 /*
6049  * Read a vmcs12 field. Since these can have varying lengths and we return
6050  * one type, we chose the biggest type (u64) and zero-extend the return value
6051  * to that size. Note that the caller, handle_vmread, might need to use only
6052  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
6053  * 64-bit fields are to be returned).
6054  */
6055 static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
6056                                         unsigned long field, u64 *ret)
6057 {
6058         short offset = vmcs_field_to_offset(field);
6059         char *p;
6060 
6061         if (offset < 0)
6062                 return 0;
6063 
6064         p = ((char *)(get_vmcs12(vcpu))) + offset;
6065 
6066         switch (vmcs_field_type(field)) {
6067         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6068                 *ret = *((natural_width *)p);
6069                 return 1;
6070         case VMCS_FIELD_TYPE_U16:
6071                 *ret = *((u16 *)p);
6072                 return 1;
6073         case VMCS_FIELD_TYPE_U32:
6074                 *ret = *((u32 *)p);
6075                 return 1;
6076         case VMCS_FIELD_TYPE_U64:
6077                 *ret = *((u64 *)p);
6078                 return 1;
6079         default:
6080                 return 0; /* can never happen. */
6081         }
6082 }
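
/*
 * Illustrative sketch, not part of vmx.c: using vmcs12_read_any() as the
 * comment above describes.  Reading the 16-bit GUEST_CS_SELECTOR field
 * yields a u64 whose upper 48 bits are zero; the caller keeps only the bits
 * it needs.  The helper name is made up for this example.
 */
static inline u16 demo_read_guest_cs_selector(struct kvm_vcpu *vcpu)
{
        u64 val = 0;

        if (!vmcs12_read_any(vcpu, GUEST_CS_SELECTOR, &val))
                return 0;               /* unsupported field */
        return (u16)val;                /* only the low 16 bits are meaningful */
}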
6083 
6084 
6085 static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
6086                                     unsigned long field, u64 field_value)
6087 {
6088         short offset = vmcs_field_to_offset(field);
6089         char *p = ((char *) get_vmcs12(vcpu)) + offset;
6090         if (offset < 0)
6091                 return false;
6092         switch (vmcs_field_type(field)) {
6093         case VMCS_FIELD_TYPE_U16:
6094                 *(u16 *)p = field_value;
6095                 return true;
6096         case VMCS_FIELD_TYPE_U32:
6097                 *(u32 *)p = field_value;
6098                 return true;
6099         case VMCS_FIELD_TYPE_U64:
6100                 *(u64 *)p = field_value;
6101                 return true;
6102         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6103                 *(natural_width *)p = field_value;
6104                 return true;
6105         default:
6106                 return false; /* can never happen. */
6107         }
6108 
6109 }
6110 
6111 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6112 {
6113         int i;
6114         unsigned long field;
6115         u64 field_value;
6116         struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6117         const unsigned long *fields = shadow_read_write_fields;
6118         const int num_fields = max_shadow_read_write_fields;
6119 
6120         vmcs_load(shadow_vmcs);
6121 
6122         for (i = 0; i < num_fields; i++) {
6123                 field = fields[i];
6124                 switch (vmcs_field_type(field)) {
6125                 case VMCS_FIELD_TYPE_U16:
6126                         field_value = vmcs_read16(field);
6127                         break;
6128                 case VMCS_FIELD_TYPE_U32:
6129                         field_value = vmcs_read32(field);
6130                         break;
6131                 case VMCS_FIELD_TYPE_U64:
6132                         field_value = vmcs_read64(field);
6133                         break;
6134                 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6135                         field_value = vmcs_readl(field);
6136                         break;
6137                 }
6138                 vmcs12_write_any(&vmx->vcpu, field, field_value);
6139         }
6140 
6141         vmcs_clear(shadow_vmcs);
6142         vmcs_load(vmx->loaded_vmcs->vmcs);
6143 }
6144 
6145 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6146 {
6147         const unsigned long *fields[] = {
6148                 shadow_read_write_fields,
6149                 shadow_read_only_fields
6150         };
6151         const int max_fields[] = {
6152                 max_shadow_read_write_fields,
6153                 max_shadow_read_only_fields
6154         };
6155         int i, q;
6156         unsigned long field;
6157         u64 field_value = 0;
6158         struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6159 
6160         vmcs_load(shadow_vmcs);
6161 
6162         for (q = 0; q < ARRAY_SIZE(fields); q++) {
6163                 for (i = 0; i < max_fields[q]; i++) {
6164                         field = fields[q][i];
6165                         vmcs12_read_any(&vmx->vcpu, field, &field_value);
6166 
6167                         switch (vmcs_field_type(field)) {
6168                         case VMCS_FIELD_TYPE_U16:
6169                                 vmcs_write16(field, (u16)field_value);
6170                                 break;
6171                         case VMCS_FIELD_TYPE_U32:
6172                                 vmcs_write32(field, (u32)field_value);
6173                                 break;
6174                         case VMCS_FIELD_TYPE_U64:
6175                                 vmcs_write64(field, (u64)field_value);
6176                                 break;
6177                         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6178                                 vmcs_writel(field, (long)field_value);
6179                                 break;
6180                         }
6181                 }
6182         }
6183 
6184         vmcs_clear(shadow_vmcs);
6185         vmcs_load(vmx->loaded_vmcs->vmcs);
6186 }
6187 
6188 /*
6189  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
6190  * used before) all generate the same failure when it is missing.
6191  */
6192 static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
6193 {
6194         struct vcpu_vmx *vmx = to_vmx(vcpu);
6195         if (vmx->nested.current_vmptr == -1ull) {
6196                 nested_vmx_failInvalid(vcpu);
6197                 skip_emulated_instruction(vcpu);
6198                 return 0;
6199         }
6200         return 1;
6201 }
6202 
6203 static int handle_vmread(struct kvm_vcpu *vcpu)
6204 {
6205         unsigned long field;
6206         u64 field_value;
6207         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6208         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6209         gva_t gva = 0;
6210 
6211         if (!nested_vmx_check_permission(vcpu) ||
6212             !nested_vmx_check_vmcs12(vcpu))
6213                 return 1;
6214 
6215         /* Decode instruction info and find the field to read */
6216         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6217         /* Read the field, zero-extended to a u64 field_value */
6218         if (!vmcs12_read_any(vcpu, field, &field_value)) {
6219                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
6220                 skip_emulated_instruction(vcpu);
6221                 return 1;
6222         }
6223         /*
6224          * Now copy part of this value to register or memory, as requested.
6225          * Note that the number of bits actually copied is 32 or 64 depending
6226          * on the guest's mode (32 or 64 bit), not on the given field's length.
6227          */
6228         if (vmx_instruction_info & (1u << 10)) {
6229                 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
6230                         field_value);
6231         } else {
6232                 if (get_vmx_mem_address(vcpu, exit_qualification,
6233                                 vmx_instruction_info, &gva))
6234                         return 1;
6235                 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
6236                 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
6237                              &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
6238         }
6239 
6240         nested_vmx_succeed(vcpu);
6241         skip_emulated_instruction(vcpu);
6242         return 1;
6243 }
6244 
6245 
6246 static int handle_vmwrite(struct kvm_vcpu *vcpu)
6247 {
6248         unsigned long field;
6249         gva_t gva;
6250         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6251         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6252         /* The value to write might be 32 or 64 bits, depending on L1's long
6253          * mode, and eventually we need to write that into a field of several
6254          * possible lengths. The code below first zero-extends the value to 64
6255          * bit (field_value), and then copies only the appropriate number of
6256          * bits into the vmcs12 field.
6257          */
6258         u64 field_value = 0;
6259         struct x86_exception e;
6260 
6261         if (!nested_vmx_check_permission(vcpu) ||
6262             !nested_vmx_check_vmcs12(vcpu))
6263                 return 1;
6264 
6265         if (vmx_instruction_info & (1u << 10))
6266                 field_value = kvm_register_read(vcpu,
6267                         (((vmx_instruction_info) >> 3) & 0xf));
6268         else {
6269                 if (get_vmx_mem_address(vcpu, exit_qualification,
6270                                 vmx_instruction_info, &gva))
6271                         return 1;
6272                 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
6273                            &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
6274                         kvm_inject_page_fault(vcpu, &e);
6275                         return 1;
6276                 }
6277         }
6278 
6279 
6280         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6281         if (vmcs_field_readonly(field)) {
6282                 nested_vmx_failValid(vcpu,
6283                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
6284                 skip_emulated_instruction(vcpu);
6285                 return 1;
6286         }
6287 
6288         if (!vmcs12_write_any(vcpu, field, field_value)) {
6289                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
6290                 skip_emulated_instruction(vcpu);
6291                 return 1;
6292         }
6293 
6294         nested_vmx_succeed(vcpu);
6295         skip_emulated_instruction(vcpu);
6296         return 1;
6297 }
6298 
6299 /* Emulate the VMPTRLD instruction */
6300 static int handle_vmptrld(struct kvm_vcpu *vcpu)
6301 {
6302         struct vcpu_vmx *vmx = to_vmx(vcpu);
6303         gva_t gva;
6304         gpa_t vmptr;
6305         struct x86_exception e;
6306         u32 exec_control;
6307 
6308         if (!nested_vmx_check_permission(vcpu))
6309                 return 1;
6310 
6311         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6312                         vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
6313                 return 1;
6314 
6315         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
6316                                 sizeof(vmptr), &e)) {
6317                 kvm_inject_page_fault(vcpu, &e);
6318                 return 1;
6319         }
6320 
6321         if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
6322                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
6323                 skip_emulated_instruction(vcpu);
6324                 return 1;
6325         }
6326 
6327         if (vmx->nested.current_vmptr != vmptr) {
6328                 struct vmcs12 *new_vmcs12;
6329                 struct page *page;
6330                 page = nested_get_page(vcpu, vmptr);
6331                 if (page == NULL) {
6332                         nested_vmx_failInvalid(vcpu);
6333                         skip_emulated_instruction(vcpu);
6334                         return 1;
6335                 }
6336                 new_vmcs12 = kmap(page);
6337                 if (new_vmcs12->revision_id != VMCS12_REVISION) {
6338                         kunmap(page);
6339                         nested_release_page_clean(page);
6340                         nested_vmx_failValid(vcpu,
6341                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
6342                         skip_emulated_instruction(vcpu);
6343                         return 1;
6344                 }
6345                 if (vmx->nested.current_vmptr != -1ull)
6346                         nested_release_vmcs12(vmx);
6347 
6348                 vmx->nested.current_vmptr = vmptr;
6349                 vmx->nested.current_vmcs12 = new_vmcs12;
6350                 vmx->nested.current_vmcs12_page = page;
6351                 if (enable_shadow_vmcs) {
6352                         exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6353                         exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
6354                         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6355                         vmcs_write64(VMCS_LINK_POINTER,
6356                                      __pa(vmx->nested.current_shadow_vmcs));
6357                         vmx->nested.sync_shadow_vmcs = true;
6358                 }
6359         }
6360 
6361         nested_vmx_succeed(vcpu);
6362         skip_emulated_instruction(vcpu);
6363         return 1;
6364 }
6365 
6366 /* Emulate the VMPTRST instruction */
6367 static int handle_vmptrst(struct kvm_vcpu *vcpu)
6368 {
6369         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6370         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6371         gva_t vmcs_gva;
6372         struct x86_exception e;
6373 
6374         if (!nested_vmx_check_permission(vcpu))
6375                 return 1;
6376 
6377         if (get_vmx_mem_address(vcpu, exit_qualification,
6378                         vmx_instruction_info, &vmcs_gva))
6379                 return 1;
6380         /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
6381         if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
6382                                  (void *)&to_vmx(vcpu)->nested.current_vmptr,
6383                                  sizeof(u64), &e)) {
6384                 kvm_inject_page_fault(vcpu, &e);
6385                 return 1;
6386         }
6387         nested_vmx_succeed(vcpu);
6388         skip_emulated_instruction(vcpu);
6389         return 1;
6390 }
6391 
6392 /* Emulate the INVEPT instruction */
6393 static int handle_invept(struct kvm_vcpu *vcpu)
6394 {
6395         u32 vmx_instruction_info, types;
6396         unsigned long type;
6397         gva_t gva;
6398         struct x86_exception e;
6399         struct {
6400                 u64 eptp, gpa;
6401         } operand;
6402         u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6403 
6404         if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6405             !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6406                 kvm_queue_exception(vcpu, UD_VECTOR);
6407                 return 1;
6408         }
6409 
6410         if (!nested_vmx_check_permission(vcpu))
6411                 return 1;
6412 
6413         if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6414                 kvm_queue_exception(vcpu, UD_VECTOR);
6415                 return 1;
6416         }
6417 
6418         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6419         type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6420 
6421         types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6422 
6423         if (!(types & (1UL << type))) {
6424                 nested_vmx_failValid(vcpu,
6425                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6426                 skip_emulated_instruction(vcpu);
6427                 return 1;
6428         }
6429 
6430         /* According to the Intel VMX instruction reference, the memory
6431          * operand is read even if it isn't needed (e.g., for type==global)
6432          */
6433         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6434                         vmx_instruction_info, &gva))
6435                 return 1;
6436         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6437                                 sizeof(operand), &e)) {
6438                 kvm_inject_page_fault(vcpu, &e);
6439                 return 1;
6440         }
6441 
6442         switch (type) {
6443         case VMX_EPT_EXTENT_CONTEXT:
6444                 if ((operand.eptp & eptp_mask) !=
6445                                 (nested_ept_get_cr3(vcpu) & eptp_mask))
6446                         break;
6447         case VMX_EPT_EXTENT_GLOBAL:
6448                 kvm_mmu_sync_roots(vcpu);
6449                 kvm_mmu_flush_tlb(vcpu);
6450                 nested_vmx_succeed(vcpu);
6451                 break;
6452         default:
6453                 BUG();
6454                 break;
6455         }
6456 
6457         skip_emulated_instruction(vcpu);
6458         return 1;
6459 }
6460 
6461 static int handle_invvpid(struct kvm_vcpu *vcpu)
6462 {
6463         kvm_queue_exception(vcpu, UD_VECTOR);
6464         return 1;
6465 }
6466 
6467 /*
6468  * The exit handlers return 1 if the exit was handled fully and guest execution
6469  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
6470  * to be done to userspace and return 0.
6471  */
6472 static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6473         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
6474         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
6475         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
6476         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
6477         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
6478         [EXIT_REASON_CR_ACCESS]               = handle_cr,
6479         [EXIT_REASON_DR_ACCESS]               = handle_dr,
6480         [EXIT_REASON_CPUID]                   = handle_cpuid,
6481         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
6482         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
6483         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
6484         [EXIT_REASON_HLT]                     = handle_halt,
6485         [EXIT_REASON_INVD]                    = handle_invd,
6486         [EXIT_REASON_INVLPG]                  = handle_invlpg,
6487         [EXIT_REASON_RDPMC]                   = handle_rdpmc,
6488         [EXIT_REASON_VMCALL]                  = handle_vmcall,
6489         [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
6490         [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
6491         [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
6492         [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
6493         [EXIT_REASON_VMREAD]                  = handle_vmread,
6494         [EXIT_REASON_VMRESUME]                = handle_vmresume,
6495         [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
6496         [EXIT_REASON_VMOFF]                   = handle_vmoff,
6497         [EXIT_REASON_VMON]                    = handle_vmon,
6498         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
6499         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
6500         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
6501         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
6502         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
6503         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
6504         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
6505         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
6506         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
6507         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
6508         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
6509         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
6510         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
6511         [EXIT_REASON_INVEPT]                  = handle_invept,
6512         [EXIT_REASON_INVVPID]                 = handle_invvpid,
6513 };
6514 
6515 static const int kvm_vmx_max_exit_handlers =
6516         ARRAY_SIZE(kvm_vmx_exit_handlers);
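
/*
 * Illustrative sketch, not part of vmx.c: how a dispatcher is expected to
 * consume the table above -- index by exit reason, fall back when no handler
 * is installed.  The real dispatch lives in vmx_handle_exit() later in this
 * file and also records the raw hardware exit reason; this simplified helper
 * (name made up) only shows the table lookup described by the comment above.
 */
static inline int demo_dispatch_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
        if (exit_reason < kvm_vmx_max_exit_handlers &&
            kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);

        vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;      /* punt to userspace */
        return 0;
}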
6517 
6518 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6519                                        struct vmcs12 *vmcs12)
6520 {
6521         unsigned long exit_qualification;
6522         gpa_t bitmap, last_bitmap;
6523         unsigned int port;
6524         int size;
6525         u8 b;
6526 
6527         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6528                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6529 
6530         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6531 
6532         port = exit_qualification >> 16;
6533         size = (exit_qualification & 7) + 1;
6534 
6535         last_bitmap = (gpa_t)-1;
6536         b = -1;
6537 
6538         while (size > 0) {
6539                 if (port < 0x8000)
6540                         bitmap = vmcs12->io_bitmap_a;
6541                 else if (port < 0x10000)
6542                         bitmap = vmcs12->io_bitmap_b;
6543                 else
6544                         return 1;
6545                 bitmap += (port & 0x7fff) / 8;
6546 
6547                 if (last_bitmap != bitmap)
6548                         if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
6549                                 return 1;
6550                 if (b & (1 << (port & 7)))
6551                         return 1;
6552 
6553                 port++;
6554                 size--;
6555                 last_bitmap = bitmap;
6556         }
6557 
6558         return 0;
6559 }
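
/*
 * Illustrative sketch, not part of vmx.c: the bitmap arithmetic used above,
 * applied to a made-up access to port 0x60.  Ports 0x0000-0x7fff live in
 * I/O bitmap A and 0x8000-0xffff in bitmap B, so port 0x60 is tested at
 * byte 0x60 / 8 == 12 of bitmap A, bit 0x60 & 7 == 0.  Helper name made up.
 */
static inline gpa_t demo_io_bitmap_byte(struct vmcs12 *vmcs12, u16 port)
{
        gpa_t bitmap = port < 0x8000 ? vmcs12->io_bitmap_a : vmcs12->io_bitmap_b;

        return bitmap + (port & 0x7fff) / 8;    /* guest-physical byte to test */
}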
6560 
6561 /*
6562  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
6563  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
6564  * disinterest in the current event (read or write a specific MSR) by using an
6565  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
6566  */
6567 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
6568         struct vmcs12 *vmcs12, u32 exit_reason)
6569 {
6570         u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
6571         gpa_t bitmap;
6572 
6573         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
6574                 return 1;
6575 
6576         /*
6577          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
6578          * for the four combinations of read/write and low/high MSR numbers.
6579          * First we need to figure out which of the four to use:
6580          */
6581         bitmap = vmcs12->msr_bitmap;
6582         if (exit_reason == EXIT_REASON_MSR_WRITE)
6583                 bitmap += 2048;
6584         if (msr_index >= 0xc0000000) {
6585                 msr_index -= 0xc0000000;
6586                 bitmap += 1024;
6587         }
6588 
6589         /* Then read the msr_index'th bit from this bitmap: */
6590         if (msr_index < 1024*8) {
6591                 unsigned char b;
6592                 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
6593                         return 1;
6594                 return 1 & (b >> (msr_index & 7));
6595         } else
6596                 return 1; /* let L1 handle the wrong parameter */
6597 }
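
/*
 * Illustrative sketch, not part of vmx.c: the offset computation above for a
 * made-up WRMSR to MSR 0xc0000080 (EFER).  Writes use the upper 2048 bytes
 * of the 4K bitmap and the 0xc0000000-0xc0001fff range uses the second
 * 1024-byte half of that, so the byte to test sits at offset
 * 2048 + 1024 + 0x80 / 8 == 0xc10, bit 0.  Helper name made up.
 */
static inline gpa_t demo_msr_bitmap_byte(struct vmcs12 *vmcs12)
{
        u32 msr_index = 0xc0000080 - 0xc0000000;        /* high-range index 0x80 */

        return vmcs12->msr_bitmap + 2048 /* write */ + 1024 /* high */ +
               msr_index / 8;
}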
6598 
6599 /*
6600  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
6601  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
6602  * intercept (via guest_host_mask etc.) the current event.
6603  */
6604 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6605         struct vmcs12 *vmcs12)
6606 {
6607         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6608         int cr = exit_qualification & 15;
6609         int reg = (exit_qualification >> 8) & 15;
6610         unsigned long val = kvm_register_read(vcpu, reg);
6611 
6612         switch ((exit_qualification >> 4) & 3) {
6613         case 0: /* mov to cr */
6614                 switch (cr) {
6615                 case 0:
6616                         if (vmcs12->cr0_guest_host_mask &
6617                             (val ^ vmcs12->cr0_read_shadow))
6618                                 return 1;
6619                         break;
6620                 case 3:
6621                         if ((vmcs12->cr3_target_count >= 1 &&
6622                                         vmcs12->cr3_target_value0 == val) ||
6623                                 (vmcs12->cr3_target_count >= 2 &&
6624                                         vmcs12->cr3_target_value1 == val) ||
6625                                 (vmcs12->cr3_target_count >= 3 &&
6626                                         vmcs12->cr3_target_value2 == val) ||
6627                                 (vmcs12->cr3_target_count >= 4 &&
6628                                         vmcs12->cr3_target_value3 == val))
6629                                 return 0;
6630                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
6631                                 return 1;
6632                         break;