diff -r 19201eebab16 Config.mk
--- a/Config.mk	Thu Sep 25 13:33:50 2008 +0100
+++ b/Config.mk	Tue Nov 18 21:04:55 2008 +0900
@@ -84,7 +84,7 @@ QEMU_REMOTE=http://xenbits.xensource.com
 
 # Specify which qemu-dm to use. This may be `ioemu' to use the old
 # Mercurial in-tree version, or a local directory, or a git URL.
-# CONFIG_QEMU   ?= ioemu
+CONFIG_QEMU   ?= ioemu
 # CONFIG_QEMU   ?= ../qemu-xen.git
 ifeq ($(XEN_TARGET_ARCH),ia64)
 CONFIG_QEMU   ?= ioemu
diff -r 19201eebab16 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/ioemu/target-i386-dm/helper2.c	Tue Nov 18 21:04:56 2008 +0900
@@ -41,7 +41,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <inttypes.h>
-#include <signal.h>
+#include <sys/signal.h>
 #include <assert.h>
 
 #include <limits.h>
@@ -91,6 +91,8 @@ int send_vcpu = 0;
 //the evtchn port for polling the notification,
 #define NR_CPUS 32
 evtchn_port_t ioreq_local_port[NR_CPUS];
+
+volatile int kemari_enabled = 0;
 
 CPUX86State *cpu_x86_init(void)
 {
@@ -525,6 +527,12 @@ void cpu_handle_ioreq(void *opaque)
     }
 }
 
+static void sigusr1_handler(int sig_type) /* installed for SIGUSR1 by main_loop() */
+{
+    kemari_enabled = 1; /* QEMU will run in kemari mode */
+    xenstore_process_logdirty_event(); /* NOTE(review): runs in signal context and can reach fprintf()/exit()/free(), which POSIX does not list as async-signal-safe -- confirm */
+}
+
 int main_loop(void)
 {
     extern int vm_running;
@@ -534,6 +542,7 @@ int main_loop(void)
     int evtchn_fd = xce_handle == -1 ? -1 : xc_evtchn_fd(xce_handle);
     char *qemu_file;
     fd_set fds;
+    struct sigaction sigusr1 = { .sa_flags = 0 }; /* zero-filled: sa_flags is not assigned below when __MINIOS__ is defined */
 
     buffered_io_timer = qemu_new_timer(rt_clock, handle_buffered_io,
 				       cpu_single_env);
@@ -541,6 +550,13 @@ int main_loop(void)
 
     if (evtchn_fd != -1)
         qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, env);
+
+    sigusr1.sa_handler = sigusr1_handler;
+    sigemptyset(&sigusr1.sa_mask);
+#ifndef  __MINIOS__
+    sigusr1.sa_flags = SA_RESTART;
+#endif /* ! __MINIOS__ */
+    sigaction(SIGUSR1, &sigusr1, 0);
 
     xenstore_record_dm_state("running");
     while (1) {
@@ -556,9 +572,11 @@ int main_loop(void)
         main_loop_wait(1); /* For the select() on events */
 
         /* Save the device state */
-        asprintf(&qemu_file, "/var/lib/xen/qemu-save.%d", domid);
-        do_savevm(qemu_file);
-        free(qemu_file);
+        if (!kemari_enabled) {
+            asprintf(&qemu_file, "/var/lib/xen/qemu-save.%d", domid);
+            do_savevm(qemu_file);
+            free(qemu_file);
+        }
 
         xenstore_record_dm_state("paused");
 
diff -r 19201eebab16 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/ioemu/vl.c	Tue Nov 18 21:04:56 2008 +0900
@@ -216,6 +216,8 @@ extern int domid;
 extern int domid;
 
 PCI_EMULATION_INFO *PciEmulationInfoHead = NULL;
+
+extern volatile int kemari_enabled;
 
 /***********************************************************/
 /* x86 ISA bus support */
@@ -4928,6 +4930,10 @@ int qemu_savevm_state(QEMUFile *f)
     qemu_put_be64(f, 0); /* total size */
 
     for(se = first_se; se != NULL; se = se->next) {
+        /* ignore vga in kemari mode */
+        if (kemari_enabled && strstr(se->idstr, "vga"))
+            continue;
+
         /* ID string */
         len = strlen(se->idstr);
         qemu_put_byte(f, len);
diff -r 19201eebab16 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/ioemu/xenstore.c	Tue Nov 18 21:04:56 2008 +0900
@@ -373,6 +373,9 @@ unsigned long logdirty_bitmap_size;
 unsigned long logdirty_bitmap_size;
 extern int vga_ram_size, bios_size;
 
+extern volatile int kemari_enabled;
+static char *kemari_qemu_info = NULL;
+
 void xenstore_process_logdirty_event(void)
 {
     char *act;
@@ -446,6 +449,7 @@ void xenstore_process_logdirty_event(voi
             seg = NULL;
             return;
         }
+        kemari_qemu_info = seg + logdirty_bitmap_size * 2;
 #endif
 
         /* Remember the paths for the next-active and active entries */
@@ -463,26 +467,52 @@ void xenstore_process_logdirty_event(voi
         }
     }
 
-    fprintf(logfile, "Triggered log-dirty buffer switch\n");
-    
-    /* Read the required active buffer from the store */
-    act = xs_read(xsh, XBT_NULL, next_active_path, &len);
-    if (!act) {
-        fprintf(logfile, "Log-dirty: can't read next-active\n");
-        exit(1);
-    }
-
-    /* Switch buffers */
-    i = act[0] - '0';
-    if (i != 0 && i != 1) {
-        fprintf(logfile, "Log-dirty: bad next-active entry: %s\n", act);
-        exit(1);
-    }
-    logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
-
-    /* Ack that we've switched */
-    xs_write(xsh, XBT_NULL, active_path, act, len);
-    free(act);
+    if (kemari_enabled) {
+        char *qemu_file;
+
+        while (kemari_qemu_info[1])
+            xen_rmb();
+
+        i = kemari_qemu_info[0];
+        if (i != 0 && i != 1) { /* index must select bitmap 0 or 1 */
+            fprintf(logfile, "Log-dirty: bad next-active entry: %d\n", (int)i); /* 'act' is never set on the kemari path, so report the index itself */
+            exit(1);
+        }
+
+        logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
+        kemari_qemu_info[1] = 1;
+        xen_wmb();
+
+        while (kemari_qemu_info[2])
+            xen_rmb();
+        /* Save the device state to tmpfs */
+        asprintf(&qemu_file, "/dev/shm/qemu-save.%d", domid);
+        do_savevm(qemu_file);
+        free(qemu_file);
+        kemari_qemu_info[2] = 1;
+        xen_wmb();
+    } else {
+        fprintf(logfile, "Triggered log-dirty buffer switch\n");
+
+        /* Read the required active buffer from the store */
+        act = xs_read(xsh, XBT_NULL, next_active_path, &len);
+        if (!act) {
+            fprintf(logfile, "Log-dirty: can't read next-active\n");
+            exit(1);
+        }
+
+        /* Switch buffers */
+        i = act[0] - '0';
+        if (i != 0 && i != 1) {
+            fprintf(logfile, "Log-dirty: bad next-active entry: %s\n", act);
+            exit(1);
+        }
+        logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
+
+        /* Ack that we've switched */
+        xs_write(xsh, XBT_NULL, active_path, act, len);
+        free(act);
+    }
 }
 
 
diff -r 19201eebab16 tools/libxc/Makefile
--- a/tools/libxc/Makefile	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/Makefile	Tue Nov 18 21:04:56 2008 +0900
@@ -30,6 +30,8 @@ GUEST_SRCS-y :=
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari.c xc_dom_kemari_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari_restore.c
 GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
 vpath %.c ../../xen/common/libelf
diff -r 19201eebab16 tools/libxc/xc_cpuid_x86.c
--- a/tools/libxc/xc_cpuid_x86.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xc_cpuid_x86.c	Tue Nov 18 21:04:56 2008 +0900
@@ -148,7 +148,7 @@ static void xc_cpuid_hvm_policy(
     int xc, domid_t domid, const unsigned int *input, unsigned int *regs)
 {
     char brand[13];
-    unsigned long pae;
+    unsigned long long pae;
     int is_pae;
 
     xc_get_hvm_param(xc, domid, HVM_PARAM_PAE_ENABLED, &pae);
diff -r 19201eebab16 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xc_domain.c	Tue Nov 18 21:04:56 2008 +0900
@@ -690,7 +690,7 @@ int xc_domain_send_trigger(int xc_handle
     return do_domctl(xc_handle, &domctl);
 }
 
-int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value)
+int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long long value)
 {
     DECLARE_HYPERCALL;
     xen_hvm_param_t arg;
@@ -709,7 +709,7 @@ int xc_set_hvm_param(int handle, domid_t
     return rc;
 }
 
-int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value)
+int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long long *value)
 {
     DECLARE_HYPERCALL;
     xen_hvm_param_t arg;
diff -r 19201eebab16 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xc_domain_save.c	Tue Nov 18 21:04:56 2008 +0900
@@ -1400,7 +1400,7 @@ int xc_domain_save(int xc_handle, int io
         } chunk = { -3, 0 };
 
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                         (unsigned long *)&chunk.ident_pt);
+                         &chunk.ident_pt);
 
         if ( (chunk.ident_pt != 0) &&
              write_exact(io_fd, &chunk, sizeof(chunk)) )
@@ -1425,11 +1425,11 @@ int xc_domain_save(int xc_handle, int io
         /* Save magic-page locations. */
         memset(magic_pfns, 0, sizeof(magic_pfns));
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
-                         (unsigned long *)&magic_pfns[0]);
+                         &magic_pfns[0]);
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
-                         (unsigned long *)&magic_pfns[1]);
+                         &magic_pfns[1]);
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
-                         (unsigned long *)&magic_pfns[2]);
+                         &magic_pfns[2]);
         if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
         {
             PERROR("Error when writing to state file (7)");
diff -r 19201eebab16 tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xc_resume.c	Tue Nov 18 21:04:56 2008 +0900
@@ -27,7 +27,7 @@ static int modify_returncode(int xc_hand
     /* HVM guests without PV drivers do not have a return code to modify. */
     if ( info.hvm )
     {
-        unsigned long irq = 0;
+        unsigned long long irq = 0;
         xc_get_hvm_param(xc_handle, domid, HVM_PARAM_CALLBACK_IRQ, &irq);
         if ( !irq )
             return 0;
diff -r 19201eebab16 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xenctrl.h	Tue Nov 18 21:04:56 2008 +0900
@@ -1009,8 +1009,8 @@ const char *xc_error_code_to_desc(int co
  */
 xc_error_handler xc_set_error_handler(xc_error_handler handler);
 
-int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value);
-int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value);
+int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long long value);
+int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long long *value);
 
 /* IA64 specific, nvram save */
 int xc_ia64_save_to_nvram(int xc_handle, uint32_t dom);
@@ -1150,4 +1150,13 @@ int xc_pm_get_cxstat(int xc_handle, int 
 int xc_pm_get_cxstat(int xc_handle, int cpuid, struct xc_cx_stat *cxpt);
 int xc_pm_reset_cxstat(int xc_handle, int cpuid);
 
+/* kemari control interface */
+int xc_kemari_control(int xc_handle,
+                      uint32_t domid,
+                      uint32_t cmd,
+                      evtchn_port_t *port,
+                      uint32_t *num_pages,
+                      uint64_t *mfn,
+                      uint16_t tap_mode);
+
 #endif /* XENCTRL_H */
diff -r 19201eebab16 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xenguest.h	Wed Nov 19 13:50:59 2008 +0900
@@ -43,6 +43,51 @@ int xc_domain_save(int xc_handle, int io
  * @return 0 on success, -1 on failure
  */
 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      unsigned int console_evtchn, unsigned long *console_mfn,
+                      unsigned int hvm, unsigned int pae);
+
+/**
+ * This function will save a running domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to save a domain to
+ * @parm dom the id of the domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom, 
+                   uint32_t flags /* XCFLAGS_xxx */,
+                   int hvm,
+                   void *(*init_qemu_maps)(int, unsigned));
+
+/**
+ * This function will update a domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to save a domain to
+ * @parm dom the id of the domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom, 
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void));
+
+/**
+ * This function will restore a saved domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to restore a domain from
+ * @parm dom the id of the domain
+ * @parm store_evtchn the store event channel for this domain to use
+ * @parm store_mfn returned with the mfn of the store page
+ * @parm hvm non-zero if this is a HVM restore
+ * @parm pae non-zero if this HVM domain has PAE support enabled
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom,
                       unsigned int store_evtchn, unsigned long *store_mfn,
                       unsigned int console_evtchn, unsigned long *console_mfn,
                       unsigned int hvm, unsigned int pae);
diff -r 19201eebab16 tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xg_private.h	Tue Nov 18 21:04:56 2008 +0900
@@ -17,6 +17,7 @@
 
 #include <xen/memory.h>
 #include <xen/elfnote.h>
+#include <xen/kemari.h>
 
 #ifndef ELFSIZE
 #include <limits.h>
diff -r 19201eebab16 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/libxc/xg_save_restore.h	Wed Nov 19 13:50:59 2008 +0900
@@ -13,7 +13,7 @@
 ** We process save/restore/migrate in batches of pages; the below
 ** determines how many pages we (at maximum) deal with in each batch.
 */
-#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
+#define MAX_BATCH_SIZE 262144   /* up to 262144 pages (1GB) at a time */
 
 /* When pinning page tables at the end of restore, we also use batching. */
 #define MAX_PIN_BATCH  1024
diff -r 19201eebab16 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c	Tue Nov 18 21:04:56 2008 +0900
@@ -490,7 +490,7 @@ static PyObject *pyxc_get_hvm_param(XcOb
 {
     uint32_t dom;
     int param;
-    unsigned long value;
+    unsigned long long value;
 
     static char *kwd_list[] = { "domid", "param", NULL }; 
     if ( !PyArg_ParseTupleAndKeywords(args, kwds, "ii", kwd_list,
@@ -500,7 +500,7 @@ static PyObject *pyxc_get_hvm_param(XcOb
     if ( xc_get_hvm_param(self->xc_handle, dom, param, &value) != 0 )
         return pyxc_error_to_exception();
 
-    return PyLong_FromUnsignedLong(value);
+    return PyLong_FromUnsignedLongLong(value);
 
 }
 
diff -r 19201eebab16 tools/python/xen/xend/XendAPI.py
--- a/tools/python/xen/xend/XendAPI.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/XendAPI.py	Tue Nov 18 21:04:57 2008 +0900
@@ -1780,9 +1780,10 @@ class XendAPI(object):
         port = other_config.get("port", 0)
         node = other_config.get("node", -1)
         ssl = other_config.get("ssl", None)
+        kemari = other_config.get("kemari", None)
         
         xendom.domain_migrate(xeninfo.getDomid(), destination_url,
-                              bool(live), port, node, ssl)
+                              bool(live), port, node, ssl, kemari)
         return xen_api_success_void()
 
     def VM_save(self, _, vm_ref, dest, checkpoint):
diff -r 19201eebab16 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/XendCheckpoint.py	Tue Nov 18 21:04:57 2008 +0900
@@ -28,7 +28,9 @@ QEMU_SIGNATURE = "QemuDeviceModelRecord"
 QEMU_SIGNATURE = "QemuDeviceModelRecord"
 dm_batch = 512
 XC_SAVE = "xc_save"
+XC_KEMARI_SAVE = "xc_kemari_save"
 XC_RESTORE = "xc_restore"
+XC_KEMARI_RESTORE = "xc_kemari_restore"
 
 
 sizeof_int = calcsize("i")
@@ -64,11 +66,38 @@ def insert_after(list, pred, value):
               list.insert (i+1, value)
     return
 
-
-def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
+def get_dev_info(info, n):  # look up key n in a list of [key, value, ...] entries
+    i = 0
+    while i < len(info):
+        if (info[i][0] == n):
+            return [n, info[i][1]]  # first matching entry wins
+        i = i + 1
+    return [n, '']  # no entry named n: pair it with an empty string
+
+def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1, kemari=False):
     write_exact(fd, SIGNATURE, "could not write guest state file: signature")
 
     sxprep = dominfo.sxpr()
+
+    # Add kemari option if enabled.
+    if kemari:
+        sxprep.append(['kemari', kemari])
+        pv_devlist = []
+        pv_devs = dominfo.getDeviceSxprs('vbd')
+        for x in pv_devs:
+            devinfo = []
+            for n in ['event-channel', 'ring-ref']:
+                devinfo.append(get_dev_info(x[1], n))
+            pv_devlist.append([x[0], devinfo])
+        pv_devs = dominfo.getDeviceSxprs('vif')
+        for x in pv_devs:
+            devinfo = []
+            for n in ['event-channel', 'tx-ring-ref', 'rx-ring-ref',
+                'request-rx-copy', 'feature-rx-notify', 'feature-sg',
+                'feature-gso-tcpv4']:
+                devinfo.append(get_dev_info(x[1], n))
+            pv_devlist.append([x[0], devinfo])
+        sxprep.append(['kemari-device-info', pv_devlist])
 
     if node > -1:
         insert_after(sxprep,'vcpus',['node', str(node)])
@@ -97,7 +126,16 @@ def save(fd, dominfo, network, live, dst
         # enabled. Passing "0" simply uses the defaults compiled into
         # libxenguest; see the comments and/or code in xc_linux_save() for
         # more information.
-        cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
+        if kemari:
+            if not hvm:
+                raise XendError("You can only use kemari on HVM domain")
+
+            cmd = [xen.util.auxbin.pathTo(XC_KEMARI_SAVE), str(fd),
+               str(dominfo.getDomid()), "0", "0", 
+               str(int(live) | (int(hvm) << 2)) ]
+            log.debug("[xc_save]: %s", string.join(cmd))
+        else:
+            cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
                str(dominfo.getDomid()), "0", "0", 
                str(int(live) | (int(hvm) << 2)) ]
         log.debug("[xc_save]: %s", string.join(cmd))
@@ -125,7 +163,7 @@ def save(fd, dominfo, network, live, dst
         forkHelper(cmd, fd, saveInputHandler, False)
 
         # put qemu device model state
-        if os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()):
+        if not kemari and os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()):
             write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
             qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
                               os.O_RDONLY)
@@ -138,7 +176,7 @@ def save(fd, dominfo, network, live, dst
             os.close(qemu_fd)
             os.remove("/var/lib/xen/qemu-save.%d" % dominfo.getDomid())
 
-        if checkpoint:
+        if checkpoint or kemari:
             dominfo.resumeDomain()
         else:
             dominfo.destroy()
@@ -184,6 +222,16 @@ def restore(xd, fd, dominfo = None, paus
         raise XendError("not a valid guest state file: config parse")
 
     vmconfig = p.get_val()
+
+    # Check whether kemari is enabled.
+    # Since Xen does not know the kemari option, this option will not be migrated.
+    is_kemari = False
+    kemari_device_info = []
+    for v in vmconfig:
+        if v[0] == 'kemari' and v[1]:
+            is_kemari = True
+        if v[0] == 'kemari-device-info' and v[1]:
+            kemari_device_info = v[1]
 
     if not relocating:
         domconfig = XendConfig(sxp_obj = vmconfig)
@@ -258,7 +306,15 @@ def restore(xd, fd, dominfo = None, paus
         shadow_cur = xc.shadow_mem_control(dominfo.getDomid(), shadow / 1024)
         dominfo.info['shadow_memory'] = shadow_cur
 
-        cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
+        # Use Kemari restore.  Switching mechanism between normal migration
+        # and Kemari migration will be implemented later.
+        #
+        if is_kemari:
+            cmd = map(str, [xen.util.auxbin.pathTo(XC_KEMARI_RESTORE),
+                        fd, dominfo.getDomid(),
+                        store_port, console_port, int(is_hvm), pae, apic])
+        else:
+            cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
                         fd, dominfo.getDomid(),
                         store_port, console_port, int(is_hvm), pae, apic])
         log.debug("[xc_restore]: %s", string.join(cmd))
@@ -266,6 +322,8 @@ def restore(xd, fd, dominfo = None, paus
         handler = RestoreInputHandler()
 
         forkHelper(cmd, fd, handler.handler, True)
+        if is_kemari:
+            os.close(fd)
 
         # We don't want to pass this fd to any other children -- we 
         # might need to recover the disk space that backs it.
@@ -285,7 +343,7 @@ def restore(xd, fd, dominfo = None, paus
         # get qemu state and create a tmp file for dm restore
         # Even PV guests may have QEMU stat, but its not currently
         # used so only bother with HVM currently.
-        if is_hvm:
+        if is_hvm and not is_kemari:
             qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
                                         "invalid device model signature read")
             if qemu_signature != QEMU_SIGNATURE:
@@ -303,8 +361,10 @@ def restore(xd, fd, dominfo = None, paus
             os.close(qemu_fd)
             restore_image.setCpuid()
 
-
-        os.read(fd, 1)           # Wait for source to close connection
+        if is_kemari:
+            restore_image.setCpuid()
+        else:
+            os.read(fd, 1)           # Wait for source to close connection
         
         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
 
@@ -322,7 +382,10 @@ def restore(xd, fd, dominfo = None, paus
             lock = False;
 
         try:
-            dominfo.waitForDevices() # Wait for backends to set up
+            if is_kemari:
+                dominfo.waitForAttachedDevices(kemari_device_info)
+            else:
+                dominfo.waitForDevices() # Wait for backends to set up
         except Exception, exn:
             log.exception(exn)
 
diff -r 19201eebab16 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/XendDomain.py	Tue Nov 18 21:04:57 2008 +0900
@@ -1267,7 +1267,7 @@ class XendDomain:
 
         return val       
 
-    def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None):
+    def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None, kemari=None):
         """Start domain migration.
         
         @param domid: Domain ID or Name
@@ -1332,7 +1332,7 @@ class XendDomain:
 
             try:
                 XendCheckpoint.save(p2cwrite, dominfo, True, live, dst,
-                                    node=node)
+                                    node=node, kemari=kemari)
             finally:
                 sock.shutdown()
                 sock.close()
@@ -1358,7 +1358,7 @@ class XendDomain:
 
             try:
                 XendCheckpoint.save(sock.fileno(), dominfo, True, live,
-                                    dst, node=node)
+                                    dst, node=node, kemari=kemari)
             finally:
                 sock.close()
 
diff -r 19201eebab16 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Nov 18 21:04:57 2008 +0900
@@ -869,6 +869,14 @@ class XendDomainInfo:
         for devclass in XendDevices.valid_devices():
             self.getDeviceController(devclass).waitForDevices()
 
+    def waitForAttachedDevices(self, devinfo):
+        """Wait for this domain's configured devices to connect (kemari restore).
+        @param devinfo: saved per-device entries, handed to every device controller.
+        @raise VmError: if any device fails to initialise.
+        """
+        for devclass in XendDevices.valid_devices():
+            self.getDeviceController(devclass).waitForAttachedDevices(devinfo)
+
     def hvm_destroyPCIDevice(self, vslot):
         log.debug("hvm_destroyPCIDevice called %s", vslot)
 
diff -r 19201eebab16 tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/server/DevController.py	Tue Nov 18 21:04:57 2008 +0900
@@ -53,6 +53,7 @@ xenbusState = {
     'Closed'       : 6,
     'Reconfiguring': 7,
     'Reconfigured' : 8,
+    'Attached'     : 9,
     }
 
 xoptions = XendOptions.instance()
@@ -190,6 +191,59 @@ class DevController:
                 err = "Busy."
             raise VmError("Device %s (%s) could not be connected.\n%s" %
                           (devid, self.deviceClass, err))
+
+
+    def waitForAttachedDevices(self, devinfo):  # run waitForAttachedDevice() for every device ID of this class
+        log.debug("Waiting for attached devices %s.", self.deviceClass)
+        seq = self.deviceIDs()
+        return [self.waitForAttachedDevice(item, devinfo) for item in seq]  # one result per device ID
+
+
+    def waitForAttachedDevice(self, devid, devinfo):  # wait for the backend, then write the saved frontend entries and mark the device Attached
+        log.debug("Waiting for attached %s.", devid)
+
+        if not self.hotplug:
+            return
+        
+        (status, err) = self.waitForBackend(devid)
+
+        if status == Timeout:
+            self.destroyDevice(devid, False)
+            raise VmError("Device %s (%s) could not be connected. "
+                          "Hotplug scripts not working." %
+                          (devid, self.deviceClass))
+
+        elif status == Error:
+            self.destroyDevice(devid, False)
+            raise VmError("Device %s (%s) could not be connected. "
+                          "Backend device not found." %
+                          (devid, self.deviceClass))
+
+        elif status == Missing:
+            # Don't try to destroy the device; it's already gone away.
+            raise VmError("Device %s (%s) could not be connected. "
+                          "Device not found." % (devid, self.deviceClass))
+
+        elif status == Busy:
+            err = None
+            frontpath = self.frontendPath(devid)
+            backpath = xstransact.Read(frontpath, "backend")
+            if backpath:
+                err = xstransact.Read(backpath, HOTPLUG_ERROR_NODE)
+            if not err:
+                err = "Busy."
+                
+            self.destroyDevice(devid, False)
+            raise VmError("Device %s (%s) could not be connected.\n%s" %
+                          (devid, self.deviceClass, err))
+
+        for x in devinfo:  # each x is [devid-as-string, [[key, value], ...]]
+            if x[0] == str(devid): # x[0] was changed to string for transfer.
+                for y in x[1]:
+                    if y[0] and y[1]:  # skip entries with an empty key or value
+                        self.writeFrontend(devid, y[0], str(y[1]))
+                        log.debug("%s %s set for %s.", y[0], y[1], devid)
+                self.writeFrontend(devid, 'state', str(xenbusState['Attached']))  # 'Attached' is 9 in xenbusState
 
 
     def waitForDevice_destroy(self, devid, backpath):
@@ -483,6 +537,13 @@ class DevController:
         else:
             raise VmError("Device %s not connected" % devid)
 
+    def writeFrontend(self, devid, *args):  # xstransact.Write(*args) under this device's frontend path
+        frontpath = self.frontendPath(devid)
+
+        if frontpath:
+            xstransact.Write(frontpath, *args)
+        else:
+            raise VmError("Device %s not connected" % devid)  # no frontend path for devid
 
 ## private:
 
diff -r 19201eebab16 tools/python/xen/xend/server/vfbif.py
--- a/tools/python/xen/xend/server/vfbif.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xend/server/vfbif.py	Tue Nov 18 21:04:57 2008 +0900
@@ -39,6 +39,10 @@ class VfbifController(DevController):
                      if devinfo[i] is not None])
 
     def waitForDevice(self, devid):
+        # is a qemu-dm managed device, don't wait for hotplug for these.
+        return
+
+    def waitForAttachedDevice(self, devid, devinfo):
         # is a qemu-dm managed device, don't wait for hotplug for these.
         return
 
diff -r 19201eebab16 tools/python/xen/xm/migrate.py
--- a/tools/python/xen/xm/migrate.py	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/python/xen/xm/migrate.py	Tue Nov 18 21:04:57 2008 +0900
@@ -51,6 +51,10 @@ gopts.opt('ssl', short='s',
           fn=set_true, default=None,
           use="Use ssl connection for migration.")
 
+gopts.opt('kemari', short='k',
+          fn=set_true, default=None,
+          use="Use kemari migration.")
+
 def help():
     return str(gopts)
     
@@ -70,7 +74,8 @@ def main(argv):
         other_config = {
             "port":     opts.vals.port,
             "node":     opts.vals.node,
-            "ssl":      opts.vals.ssl
+            "ssl":      opts.vals.ssl,
+            "kemari":   opts.vals.kemari
             }
         server.xenapi.VM.migrate(vm_ref, dst, bool(opts.vals.live),
                                  other_config)
@@ -78,4 +83,5 @@ def main(argv):
         server.xend.domain.migrate(dom, dst, opts.vals.live,
                                    opts.vals.port,
                                    opts.vals.node,
-                                   opts.vals.ssl)
+                                   opts.vals.ssl,
+                                   opts.vals.kemari)
diff -r 19201eebab16 tools/xcutils/Makefile
--- a/tools/xcutils/Makefile	Thu Sep 25 13:33:50 2008 +0100
+++ b/tools/xcutils/Makefile	Tue Nov 18 21:04:57 2008 +0900
@@ -19,6 +19,7 @@ PROG_DEP = .*.d
 PROG_DEP = .*.d
 
 PROGRAMS = xc_restore xc_save readnotes lsevtchn
+PROGRAMS += xc_kemari_test xc_kemari_save xc_kemari_restore
 
 LDLIBS   = $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) $(LDFLAGS_libxenstore)
 
diff -r 19201eebab16 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/arch/x86/Makefile	Tue Nov 18 21:04:57 2008 +0900
@@ -4,6 +4,7 @@ subdir-y += hvm
 subdir-y += hvm
 subdir-y += mm
 subdir-y += oprofile
+subdir-y += kemari
 
 subdir-$(x86_32) += x86_32
 subdir-$(x86_64) += x86_64
diff -r 19201eebab16 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/arch/x86/domctl.c	Tue Nov 18 21:04:57 2008 +0900
@@ -20,6 +20,7 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/iocap.h>
+#include <xen/kemari.h>
 #include <xen/paging.h>
 #include <asm/irq.h>
 #include <asm/hvm/hvm.h>
@@ -997,6 +998,21 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_kemari_op: /* hand the request to do_kemari_op() for the target domain */
+    {
+        struct domain *d = rcu_lock_domain_by_id(domctl->domain);
+
+        ret = -ESRCH;
+        if ( unlikely(d == NULL) )
+            break;
+        ret = do_kemari_op(d, &domctl->u.kemari_op);
+        /* Copy results back; a failed copy must not be reported as success. */
+        if ( copy_to_guest(u_domctl, domctl, 1) && (ret == 0) )
+            ret = -EFAULT;
+        rcu_unlock_domain(d);
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
diff -r 19201eebab16 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/arch/x86/mm.c	Tue Nov 18 21:04:58 2008 +0900
@@ -3376,6 +3376,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         case XENMAPSPACE_shared_info:
             if ( xatp.idx == 0 )
                 mfn = virt_to_mfn(d->shared_info);
+            gdprintk(XENLOG_DEBUG, "shared_info: mfn=%lu, gpfn=%lu\n",
+                     mfn, xatp.gpfn);
             break;
         case XENMAPSPACE_grant_table:
             spin_lock(&d->grant_table->lock);
@@ -3386,7 +3388,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
 
             if ( xatp.idx < nr_grant_frames(d->grant_table) )
                 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
-
+            gdprintk(XENLOG_DEBUG, "grant_table: mfn=%lu, gpfn=%lu\n",
+                     mfn, xatp.gpfn);
             spin_unlock(&d->grant_table->lock);
             break;
         default:
diff -r 19201eebab16 xen/common/event_channel.c
--- a/xen/common/event_channel.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/common/event_channel.c	Tue Nov 18 21:04:58 2008 +0900
@@ -201,7 +201,8 @@ static long evtchn_bind_interdomain(evtc
     if ( !port_is_valid(rd, rport) )
         ERROR_EXIT_DOM(-EINVAL, rd);
     rchn = evtchn_from_port(rd, rport);
-    if ( (rchn->state != ECS_UNBOUND) ||
+    /* Kemari reuses rchn information; NOTE(review): '&&' loosens the check. */
+    if ( (rchn->state != ECS_UNBOUND) && 
          (rchn->u.unbound.remote_domid != ld->domain_id) )
         ERROR_EXIT_DOM(-EINVAL, rd);
 
@@ -348,6 +349,113 @@ static long evtchn_bind_pirq(evtchn_bind
     return rc;
 }
 
+/* Switch interdomain channel tap_dom:tap_port and its peer to ECS_TAP with
+ * the given mode/redirect.  0 on success, -EINVAL or -ESRCH on failure. */
+long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap)
+{
+    struct evtchn *lchn, *rchn;
+    struct domain *ld, *rd;
+    int            lport = bind_tap->tap_port, rport;
+    domid_t        ldom = bind_tap->tap_dom;
+    long ret;
+
+    if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL )
+        return -ESRCH;
+
+    spin_lock(&ld->evtchn_lock);
+    ret = -EINVAL;
+    if ( !port_is_valid(ld, lport) )
+        goto lchn_out;
+    lchn = evtchn_from_port(ld, lport);
+    if ( lchn->state != ECS_INTERDOMAIN )
+        goto lchn_out;
+
+    ret = -ESRCH;
+    rd = lchn->u.interdomain.remote_dom;
+    if ( rd == NULL )
+        goto lchn_out;
+
+    /* Loopback (rd == ld): the lock is already held; retaking it deadlocks.
+     * NOTE(review): ld-then-rd order is not address-sorted; confirm safe. */
+    if ( rd != ld )
+        spin_lock(&rd->evtchn_lock);
+
+    rport = lchn->u.interdomain.remote_port;
+    if ( !port_is_valid(rd, rport) )
+        goto rchn_out;
+    rchn = evtchn_from_port(rd, rport);
+    if ( rchn->state != ECS_INTERDOMAIN )
+        goto rchn_out;
+
+    lchn->state = ECS_TAP;
+    lchn->tap.mode = bind_tap->mode;
+    lchn->tap.redirect = bind_tap->redirect;
+    rchn->state = ECS_TAP;
+    rchn->tap.redirect = bind_tap->redirect;
+    ret = 0;
+
+ rchn_out:
+    if ( rd != ld )
+        spin_unlock(&rd->evtchn_lock);
+ lchn_out:
+    spin_unlock(&ld->evtchn_lock);
+    rcu_unlock_domain(ld);
+    return ret;
+}
+
+/* Undo evtchn_bind_tap(): move tap_dom:tap_port and its peer from ECS_TAP
+ * back to ECS_INTERDOMAIN, clearing both redirects.  0, -EINVAL or -ESRCH. */
+long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap)
+{
+    struct evtchn *lchn, *rchn;
+    struct domain *ld, *rd;
+    int            lport = bind_tap->tap_port, rport;
+    domid_t        ldom = bind_tap->tap_dom;
+    long ret;
+
+    if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL )
+        return -ESRCH;
+
+    spin_lock(&ld->evtchn_lock);
+    ret = -EINVAL;
+    if ( !port_is_valid(ld, lport) )
+        goto lchn_out;
+    lchn = evtchn_from_port(ld, lport);
+    if ( lchn->state != ECS_TAP )
+        goto lchn_out;
+
+    ret = -ESRCH;
+    rd = lchn->u.interdomain.remote_dom;
+    if ( rd == NULL )
+        goto lchn_out;
+
+    /* Loopback (rd == ld): the lock is already held; retaking it deadlocks.
+     * NOTE(review): ld-then-rd order is not address-sorted; confirm safe. */
+    if ( rd != ld )
+        spin_lock(&rd->evtchn_lock);
+
+    rport = lchn->u.interdomain.remote_port;
+    if ( !port_is_valid(rd, rport) )
+        goto rchn_out;
+    rchn = evtchn_from_port(rd, rport);
+    if ( rchn->state != ECS_TAP )
+        goto rchn_out;
+
+    lchn->state = ECS_INTERDOMAIN;
+    lchn->tap.mode = bind_tap->mode;
+    lchn->tap.redirect = NULL;
+    rchn->state = ECS_INTERDOMAIN;
+    rchn->tap.redirect = NULL;
+    ret = 0;
+
+ rchn_out:
+    if ( rd != ld )
+        spin_unlock(&rd->evtchn_lock);
+ lchn_out:
+    spin_unlock(&ld->evtchn_lock);
+    rcu_unlock_domain(ld);
+    return ret;
+}
 
 static long __evtchn_close(struct domain *d1, int port1)
 {
@@ -509,6 +617,13 @@ int evtchn_send(struct domain *d, unsign
 
     switch ( lchn->state )
     {
+    case ECS_TAP:
+        rd    = lchn->u.interdomain.remote_dom;
+        rport = lchn->u.interdomain.remote_port;
+        rchn  = evtchn_from_port(rd, rport);
+        if ( lchn->tap.redirect != NULL ) /* hook may have been bound NULL */
+            lchn->tap.redirect(lchn, rchn);
+        /* No break: continues into the ECS_INTERDOMAIN case that follows. */
     case ECS_INTERDOMAIN:
         rd    = lchn->u.interdomain.remote_dom;
         rport = lchn->u.interdomain.remote_port;
@@ -1037,6 +1152,30 @@ void notify_via_xen_event_channel(int lp
     spin_unlock(&ld->evtchn_lock);
 }
 
+/* Set lport's interdomain peer pending; ld is locked only if not current. */
+void notify_via_xen_evtchn_tap(struct domain *ld, int lport)
+{
+    const int      foreign = (ld != current->domain);
+    struct evtchn *src, *dst;
+
+    if ( foreign )
+        spin_lock(&ld->evtchn_lock);
+
+    ASSERT(port_is_valid(ld, lport));
+    src = evtchn_from_port(ld, lport);
+    ASSERT(src->consumer_is_xen);
+
+    if ( likely(src->state == ECS_INTERDOMAIN) )
+    {
+        struct domain *peer  = src->u.interdomain.remote_dom;
+        int            pport = src->u.interdomain.remote_port;
+        dst = evtchn_from_port(peer, pport);
+        evtchn_set_pending(peer->vcpu[dst->notify_vcpu_id], pport);
+    }
+
+    if ( foreign )
+        spin_unlock(&ld->evtchn_lock);
+}
 
 int evtchn_init(struct domain *d)
 {
diff -r 19201eebab16 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/common/grant_table.c	Tue Nov 18 21:04:58 2008 +0900
@@ -404,7 +404,6 @@ __gnttab_map_grant_ref(
     op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
     op->handle       = handle;
     op->status       = GNTST_okay;
-
     rcu_unlock_domain(rd);
     return;
 
diff -r 19201eebab16 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/include/public/domctl.h	Tue Nov 18 21:04:58 2008 +0900
@@ -614,6 +614,38 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_subsc
 #define XEN_DOMCTL_set_machine_address_size  51
 #define XEN_DOMCTL_get_machine_address_size  52
 
+/* Kemari interface: domctl number, command codes and argument layout. */
+#define XEN_DOMCTL_kemari_op         53
+
+#define _XEN_KEMARI_OP_enable 0  /* _XEN_* = bit index, XEN_* = 1UL << index */
+#define XEN_KEMARI_OP_enable  (1UL<<_XEN_KEMARI_OP_enable)
+#define _XEN_KEMARI_OP_off    1
+#define XEN_KEMARI_OP_off     (1UL<<_XEN_KEMARI_OP_off)
+#define _XEN_KEMARI_OP_attach 2
+#define XEN_KEMARI_OP_attach  (1UL<<_XEN_KEMARI_OP_attach)
+#define _XEN_KEMARI_OP_detach 3
+#define XEN_KEMARI_OP_detach  (1UL<<_XEN_KEMARI_OP_detach)
+
+struct xen_domctl_kemari_op {
+    uint32_t cmd;                     /* IN: XEN_KEMARI_OP_* command */
+
+    union {
+        struct {
+            uint32_t port;            /* OUT: read back by xc_kemari_control() */
+            uint32_t num_pages;       /* OUT: read back by xc_kemari_control() */
+            uint64_t mfn;             /* OUT: read back by xc_kemari_control() */
+        } enable; /* XEN_KEMARI_OP_enable */
+        struct {
+            uint32_t port;            /* IN: filled in by xc_kemari_control() */
+            uint16_t evtchn_tap_mode; /* IN: caller's tap_mode argument */
+        } attach; /* XEN_KEMARI_OP_attach */
+        struct {
+            uint32_t port;            /* IN: filled in by xc_kemari_control() */
+        } detach; /* XEN_KEMARI_OP_detach */
+    } u;
+};
+typedef struct xen_domctl_kemari_op xen_domctl_kemari_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_kemari_op_t);
 
 struct xen_domctl {
     uint32_t cmd;
@@ -654,6 +686,7 @@ struct xen_domctl {
         struct xen_domctl_set_opt_feature   set_opt_feature;
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
+        struct xen_domctl_kemari_op         kemari_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
diff -r 19201eebab16 xen/include/public/io/xenbus.h
--- a/xen/include/public/io/xenbus.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/include/public/io/xenbus.h	Tue Nov 18 21:04:58 2008 +0900
@@ -63,7 +63,9 @@ enum xenbus_state {
      */
     XenbusStateReconfiguring = 7,
 
-    XenbusStateReconfigured  = 8
+    XenbusStateReconfigured  = 8,
+
+    XenbusStateAttached      = 9
 };
 typedef enum xenbus_state XenbusState;
 
diff -r 19201eebab16 xen/include/xen/event.h
--- a/xen/include/xen/event.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/include/xen/event.h	Tue Nov 18 21:04:58 2008 +0900
@@ -79,4 +79,18 @@ void notify_via_xen_event_channel(int lp
         mb(); /* set blocked status /then/ caller does his work */      \
     } while ( 0 )
 
+struct evtchn_bind_tap {
+    /* IN parameters; redirect is the hook evtchn_send() calls on ECS_TAP. */
+    domid_t       tap_dom;   /* Domain that owns the port being tapped. */
+    uint32_t      tap_port;  /* Port in tap_dom; checked with port_is_valid(). */
+    uint8_t       mode;      /* Copied into the local channel's tap.mode. */
+    long          (*redirect) (struct evtchn *lchn, struct evtchn *rchn);
+};
+
+void notify_via_xen_evtchn_tap(struct domain *ld, int lport);
+
+long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap);
+
+long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap);
+
 #endif /* __XEN_EVENT_H__ */
diff -r 19201eebab16 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/include/xen/lib.h	Tue Nov 18 21:04:58 2008 +0900
@@ -45,7 +45,7 @@ struct domain;
 
 void cmdline_parse(char *cmdline);
 
-/*#define DEBUG_TRACE_DUMP*/
+#define DEBUG_TRACE_DUMP
 #ifdef DEBUG_TRACE_DUMP
 extern void debugtrace_dump(void);
 extern void debugtrace_printk(const char *fmt, ...);
diff -r 19201eebab16 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Thu Sep 25 13:33:50 2008 +0100
+++ b/xen/include/xen/sched.h	Tue Nov 18 21:04:58 2008 +0900
@@ -19,6 +19,7 @@
 #include <xen/xenoprof.h>
 #include <xen/rcupdate.h>
 #include <xen/irq.h>
+#include <xen/kemari.h>
 
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
@@ -47,6 +48,7 @@ struct evtchn
 #define ECS_PIRQ         4 /* Channel is bound to a physical IRQ line.       */
 #define ECS_VIRQ         5 /* Channel is bound to a virtual IRQ line.        */
 #define ECS_IPI          6 /* Channel is bound to a virtual IPI line.        */
+#define ECS_TAP          7 /* Channel is bound and tapped.                   */
     u8  state;             /* ECS_* */
     u8  consumer_is_xen;   /* Consumed by Xen or by guest? */
     u16 notify_vcpu_id;    /* VCPU for local delivery notification */
@@ -61,6 +63,11 @@ struct evtchn
         u16 pirq;      /* state == ECS_PIRQ */
         u16 virq;      /* state == ECS_VIRQ */
     } u;
+    struct {
+        u8 mode;    /* Tap IN, OUT or both.  */
+        /* Function to call when an event is detected. */
+        long (*redirect) (struct evtchn *lchn, struct evtchn *rchn);
+    } tap;
 #ifdef FLASK_ENABLE
     void *ssid;
 #endif
@@ -249,6 +256,9 @@ struct domain
     /* OProfile support. */
     struct xenoprof *xenoprof;
     int32_t time_offset_seconds;
+
+    /* Kemari support. */
+    struct kemari *kemari;
 
     struct rcu_head rcu;
 
diff -r 19201eebab16 tools/libxc/xc_dom_kemari.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,79 @@
+/*
+ * xc_dom_kemari.c
+ *
+ * The API for manipulating and obtaining information on kemari-domains.
+ *
+ * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * 
+ */
+
+#include "xc_private.h"
+
+/*
+ * Kemari controller interface: issue XEN_DOMCTL_kemari_op on domain @domid.
+ *
+ * attach and detach pass *port to Xen (attach also passes @tap_mode); enable
+ * returns the port, mfn and page count through @port, @mfn and @num_pages.
+ * Returns do_domctl()'s result.  Outputs are stored only when it did not
+ * fail, so callers never receive contents Xen did not fill in.
+ */
+int xc_kemari_control(int xc_handle, uint32_t domid, uint32_t cmd,
+                      evtchn_port_t *port, uint32_t *num_pages,
+                      uint64_t *mfn, uint16_t tap_mode)
+{
+    int rc;
+    struct xen_domctl_kemari_op *kemari_op;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_kemari_op;
+    domctl.domain = (domid_t)domid;
+
+    kemari_op = &domctl.u.kemari_op;
+    kemari_op->cmd = cmd;
+
+    if ( cmd == XEN_KEMARI_OP_attach )
+    {
+        kemari_op->u.attach.port = *port;
+        kemari_op->u.attach.evtchn_tap_mode = tap_mode;
+    }
+
+    if ( cmd /* == */ & XEN_KEMARI_OP_detach )
+        kemari_op->u.detach.port = *port;
+
+    DPRINTF("xc_kemari_control: cmd=%d\n", cmd);
+    rc = do_domctl(xc_handle, &domctl);
+
+    if ( (rc >= 0) && (cmd == XEN_KEMARI_OP_enable) )
+    {
+        *port = kemari_op->u.enable.port;
+        *mfn = kemari_op->u.enable.mfn;
+        *num_pages = kemari_op->u.enable.num_pages;
+    }
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff -r 19201eebab16 tools/libxc/xc_dom_kemari_restore.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_restore.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,761 @@
+/******************************************************************************
+ * xc_dom_kemari_restore.c
+ *
+ * Restore the state of a guest session for kemari.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_restore.c.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006, Intel Corporation
+ * Copyright (c) 2007, XenSource Inc.
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+#include "xc_dom.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/params.h>
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
+static unsigned long nr_pfns;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+/* A table of P2M mappings in the current region */
+static xen_pfn_t *p2m_batch = NULL;
+
+int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      unsigned int console_evtchn, unsigned long *console_mfn,
+                      unsigned int hvm, unsigned int pae)
+{
+    int rc = 1, frc, i, n, m;
+    unsigned long mfn, pfn;
+    unsigned int prev_pc, this_pc;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+
+    /* A table of MFNs to map in the current region */
+    xen_pfn_t *region_mfn = NULL;
+
+    /* Types of the pfns in the current region */
+    unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
+    /* Our mapping of the current region (batch) */
+    char *region_base;
+
+    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* Temporary buffered memory space until all pages are read. */
+    char *tmp_region = NULL;
+
+    /* if true, go into transaction mode */
+    int kemari_transaction_mode = 0;
+
+    /* index for grant table */
+    int grant_idx = 0;
+
+    /* Callback IRQ */
+    uint64_t callback_irq = 0;
+
+    /* active and non-active id of flip buffer */
+    int info_active = 0, info_non_active = 1;
+
+    /* Buffer for holding HVM context */
+    uint8_t *hvm_buf[2] = {NULL,NULL};
+    uint32_t hvm_buf_size = 0;
+
+    /* Buffer for qemu image */
+    uint8_t *qemu_image[2] = {NULL,NULL};
+    uint32_t qemu_image_size[2] = {0,0};
+    uint32_t qemu_buff_size = 0;
+
+    /* Buffer for the EPT identity PT location. */
+    uint64_t ident_pt[2] = {0,0};
+
+
+    if ( !hvm ) {
+        ERROR("Kemari only works on HVM domain.");
+        goto out;
+    }
+
+    /* For info only */
+    nr_pfns = 0;
+
+    if ( read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("read: p2m_size");
+        goto out;
+    }
+    DPRINTF("xc_kemari_restore start: p2m_size = %lx\n", p2m_size);
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    p2m        = calloc(p2m_size, sizeof(xen_pfn_t));
+    pfn_type   = calloc(p2m_size, sizeof(unsigned long));
+
+    region_mfn = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+    p2m_batch  = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    /* use aligned page for speed up memmove(3) */
+    tmp_region = xg_memalign(PAGE_SIZE, PAGE_SIZE * MAX_BATCH_SIZE);
+
+    if ( (p2m == NULL) || (pfn_type == NULL) ||
+         (region_mfn == NULL) || (p2m_batch == NULL) ||
+         (tmp_region == NULL) )
+    {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    memset(region_mfn, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); 
+    memset(p2m_batch, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); 
+    memset(tmp_region, 0, PAGE_SIZE * MAX_BATCH_SIZE);
+           
+    if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock region_mfn");
+        goto out;
+    }
+
+    if ( lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock p2m_batch");
+        goto out;
+    }
+
+    if ( lock_pages(tmp_region, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock region_mfn");
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    if ( read_exact(io_fd, &shared_info_frame, sizeof(unsigned long)))
+    {
+        ERROR("Error when reading shared_info_frame");
+        goto out;
+    }
+    DPRINTF("xc_kemari_restore: shared_info_frame: %lx\n", shared_info_frame);
+
+    /* read HVM-specific parameters */
+    if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        ERROR("error reading magic page addresses");
+        goto out;
+    }
+
+    if (read_exact(io_fd, &callback_irq, sizeof(callback_irq)))
+    {
+        ERROR("error reading magic page addresses");
+        goto out;
+    }
+
+    /* Mark all PFNs as invalid; we allocate on demand */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+        p2m[pfn] = INVALID_P2M_ENTRY;
+
+    /*
+     * Now simply read each saved frame into its new machine frame.
+     * We uncanonicalise page tables as we go.
+     */
+    prev_pc = 0;
+
+    n = m = 0;
+    for ( ; ; )
+    {
+        int num_pages;
+        int nr_mfns;
+
+        num_pages = 0;
+        for ( ; ; ) {
+            int j;
+
+            this_pc = (n * 100) / p2m_size;
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                PPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            if ( read_exact(io_fd, &j, sizeof(int)) )
+            {
+                ERROR("Error when reading batch size");
+                goto build;
+            }
+
+            PPRINTF("batch %d\n",j);
+
+            if ( j == -3 )
+            {
+                /* Skip padding 4 bytes then read the EPT identity PT location. */
+                if ( read_exact(io_fd, &ident_pt[info_non_active],
+                        sizeof(uint32_t)) ||
+                     read_exact(io_fd, &ident_pt[info_non_active],
+                        sizeof(uint64_t)) )
+                {
+                    ERROR("error read the address of the EPT identity map");
+                    goto build;
+                }
+
+                continue;
+            }
+
+            if (j == -4)
+            {
+                uint32_t rec_size;
+                if ( read_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+                {
+                    ERROR("error read the qemu file size");
+                    goto build;
+                }
+
+                if (qemu_buff_size < rec_size)
+                {
+                    qemu_buff_size = rec_size;
+                    qemu_image[0] = realloc(qemu_image[0], qemu_buff_size);
+                    qemu_image[1] = realloc(qemu_image[1], qemu_buff_size);
+                    if ((qemu_image[0] == NULL) || (qemu_image[1] == NULL))
+                    {
+                        ERROR("error allocate memory");
+                        goto out;
+                    }
+                }
+
+                qemu_image_size[info_non_active] = rec_size;
+                if ( read_exact(io_fd, qemu_image[info_non_active],
+                    qemu_image_size[info_non_active]) )
+                {
+                    ERROR("error read the qemu image file");
+                    goto build;
+                }
+
+                continue;
+            }
+
+            if ( j == 0 )
+                break;  /* our work here is done */
+
+            /* j > 0: Read pages here */
+            if ( (j > MAX_BATCH_SIZE) || (j < 0) )
+            {
+                ERROR("Max batch size exceeded. Giving up. %d", j);
+                goto out;
+            }
+
+            if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
+            {
+                ERROR("Error when reading region pfn types");
+                goto build;
+            }
+
+            if (kemari_transaction_mode) {
+                if (num_pages != 0)
+                {
+                    ERROR("Sorry!  You cannot execute page-send-phase "
+                        "twice.  We will fix this bug in the future.");
+                    DPRINTF("Sorry\n");
+                    goto out;
+                }
+                num_pages = j;
+
+                for ( i = 0; i < num_pages; i++)
+                {
+                    void *page = tmp_region + i*PAGE_SIZE;
+                    unsigned long pagetype;
+
+                    pagetype = region_pfn_type[i] &
+                            XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                    if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                        /* a bogus/unmapped page: skip it */
+                        continue;
+
+                    if ( read_exact(io_fd, page, PAGE_SIZE) )
+                    {
+                        ERROR("Error when reading page (type was %lx)", 
+                            pagetype);
+                        goto build;
+                    }
+                }
+
+                /* 
+                 * Discard cache for portion of file read so far up to last
+                 *  page boundary every 16MB or so.
+                 */
+                m += j;
+                if ( m > MAX_PAGECACHE_USAGE )
+                {
+                    discard_file_cache(io_fd, 0 /* no flush */);
+                    m = 0;
+                }
+                continue;
+            }
+
+            /* Normal mode */
+            /* First pass for this batch: work out how much memory to alloc */
+            nr_mfns = 0; 
+            for ( i = 0; i < j; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
+                    (p2m[pfn] == INVALID_P2M_ENTRY) )
+                {
+                    /* Have a live PFN which hasn't had an MFN allocated */
+                    p2m_batch[nr_mfns++] = pfn; 
+                    p2m[pfn]--;
+                }
+            } 
+
+            /* Now allocate a bunch of mfns for this batch */
+            if ( nr_mfns &&
+                (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+                                                0, p2m_batch) != 0) )
+            { 
+                ERROR("Failed to allocate memory for batch.! %d\n", nr_mfns); 
+                for (i = 0; i < nr_mfns; i++)
+                    DPRINTF("p2m_batch[%d] = %lx\n", i, p2m_batch[i]);
+                errno = ENOMEM;
+                goto out;
+            }
+
+            /* set special pages */
+            {
+            struct xen_add_to_physmap xatp;
+            for (i = 0; i < nr_mfns; i++)
+                if (p2m_batch[i] == shared_info_frame) {
+                    xatp.domid = dom;
+                    xatp.space = XENMAPSPACE_shared_info;
+                    xatp.idx = 0;
+                    xatp.gpfn = shared_info_frame;
+                    DPRINTF("setting up shared_info_frame: %lu\n",
+                        shared_info_frame);
+                    if (xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp)
+                        != 0)
+                    {
+                        ERROR("Error setting shared_info_frame");
+                        goto out;
+                    }
+                } else if ((p2m_batch[i] > shared_info_frame)
+                    && (p2m_batch[i] <= shared_info_frame + 32)) {
+                    xatp.domid = dom;
+                    xatp.space = XENMAPSPACE_grant_table;
+                    xatp.idx = grant_idx;
+                    xatp.gpfn = p2m_batch[i];
+                    DPRINTF("grant[%d]: %lu\n", grant_idx, xatp.gpfn);
+                    if (xc_memory_op(xc_handle, XENMEM_add_to_physmap,
+                        &xatp) != 0)
+                    {
+                        PERROR("Cannot map grant table pfn: %lu", xatp.gpfn);
+                        goto out;
+                    }
+                    grant_idx++;
+                }
+            }
+
+            /* Second pass for this batch: update p2m[] and region_mfn[] */
+            nr_mfns = 0; 
+            for ( i = 0; i < j; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    region_mfn[i] = ~0UL; /* map will fail but we don't care */
+                else 
+                {
+                    if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+                    {
+                        /* We just allocated a new mfn above; update p2m */
+                        p2m[pfn] = p2m_batch[nr_mfns++]; 
+                        nr_pfns++; 
+                    }
+
+                    /* setup region_mfn[] for batch map.
+                     * For HVM guests, this interface takes PFNs, not MFNs */
+                    region_mfn[i] = pfn;
+                }
+            } 
+
+            /* Map relevant mfns */
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            for ( i = 0; i < j; i++ )
+            {
+                void *page;
+                unsigned long pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    /* a bogus/unmapped page: skip it */
+                    continue;
+
+                if ( pfn > p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    goto out;
+                }
+
+                pfn_type[pfn] = pagetype;
+
+                mfn = p2m[pfn];
+
+                page = region_base + i*PAGE_SIZE;
+
+                if ( read_exact(io_fd, page, PAGE_SIZE) )
+                {
+                    ERROR("Error when reading page (type was %lx)", pagetype);
+                    goto out;
+                }
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
+                    (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("uncanonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+                {
+                    ERROR("Bogus page type %lx page table is out of range: "
+                        "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+                    goto out;
+
+                }
+            } /* end of 'batch' for loop */
+
+            munmap(region_base, j*PAGE_SIZE);
+            n+= j; /* crude stats */
+
+            /* 
+             * Discard cache for portion of file read so far up to last
+             *  page boundary every 16MB or so.
+             */
+            m += j;
+            if ( m > MAX_PAGECACHE_USAGE )
+            {
+                discard_file_cache(io_fd, 0 /* no flush */);
+                m = 0;
+            }
+        }
+
+        /* HVM specific */
+        {
+            uint32_t rec_len;
+            
+            /* Read HVM context */
+            if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
+            {
+                ERROR("error read hvm context size!\n");
+                goto build;
+            }
+
+            if (hvm_buf[info_non_active] == NULL)
+            { /* hvm_buf will be reused. */
+                hvm_buf_size = rec_len;
+                hvm_buf[0] = malloc(hvm_buf_size);
+                hvm_buf[1] = malloc(hvm_buf_size);
+                if ( hvm_buf[0] == NULL || hvm_buf[1] == NULL)
+                {
+                    ERROR("memory alloc for hvm context buffer failed");
+                    errno = ENOMEM;
+                    goto out;
+                }
+            }
+            else if (rec_len != hvm_buf_size)
+            {
+                ERROR("Sorry, we did not thought about HVM image size "
+                    "change.");
+                    goto out;
+            }
+            
+            if ( read_exact(io_fd, hvm_buf[info_non_active], hvm_buf_size) )
+            {
+                ERROR("error loading the HVM context");
+                goto build;
+            }
+        }
+
+        /*
+         * Commit!
+         */
+        {
+            int zero = 0;
+
+            if ( write_exact(io_fd, &zero, sizeof(int))) {
+                ERROR("Error when replying to sender (errno %d)", errno);
+                goto out;
+            }
+        }
+
+        /* commit pages */
+        if (kemari_transaction_mode && num_pages > 0)
+        {
+            int nr_mfns;
+            /* First pass for this batch: work out how much memory to alloc */
+            nr_mfns = 0; 
+            for ( i = 0; i < num_pages; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
+                     (p2m[pfn] == INVALID_P2M_ENTRY) )
+                {
+                    /* Have a live PFN which hasn't had an MFN allocated */
+                    p2m_batch[nr_mfns++] = pfn; 
+                    p2m[pfn]--;
+                    DPRINTF("Cannot be occur!!! no map for pfn: %lu\n", pfn);
+                }
+            } 
+
+            /* Now allocate a bunch of mfns for this batch */
+            if ( nr_mfns &&
+                 (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+                                                    0, p2m_batch) != 0) )
+            { 
+                ERROR("Failed to allocate memory for batch.!\n"); 
+                errno = ENOMEM;
+                goto out;
+            }
+
+            /* Second pass for this batch: update p2m[] and region_mfn[] */
+            nr_mfns = 0; 
+            for ( i = 0; i < num_pages; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) {
+                    DPRINTF("pfn %lu = XEN_DOMCTL_PFINFO_XTAB\n", pfn);
+                    region_mfn[i] = ~0UL; /* map will fail but we don't care */
+                }
+                else 
+                {
+                    if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+                    {
+                        /* We just allocated a new mfn above; update p2m */
+                        p2m[pfn] = p2m_batch[nr_mfns++]; 
+                        nr_pfns++; 
+                    }
+
+                    /* setup region_mfn[] for batch map.
+                     * For HVM guests, this interface takes PFNs, not MFNs */
+                    region_mfn[i] = pfn;
+                }
+            } 
+
+            /* Map relevant mfns */
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_WRITE, region_mfn, num_pages);
+
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            for ( i = 0; i < num_pages; i++ )
+            {
+                void *page, *spage;
+                unsigned long pagetype;
+
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    /* a bogus/unmapped page: skip it */
+                    continue;
+
+                if ( pfn > p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    goto out;
+                }
+
+                pfn_type[pfn] = pagetype;
+
+                mfn = p2m[pfn];
+
+                page = region_base + i*PAGE_SIZE;
+                spage = tmp_region + i*PAGE_SIZE;
+
+                if ( !memmove(page, spage, PAGE_SIZE) )
+                {
+                    ERROR("Error when reading page (type was %lx)", pagetype);
+                    goto out;
+                }
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("uncanonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+                {
+                    ERROR("Bogus page type %lx page table is out of range: "
+                        "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+                    goto out;
+
+                }
+
+            } /* end of 'batch' for loop */
+
+            munmap(region_base, num_pages*PAGE_SIZE);
+            num_pages = 0; /* clear num_pages for refill */
+        }
+
+        /* commit HVM specific status */
+        info_active = info_non_active;
+        info_non_active = info_active ? 0 : 1;
+
+        /* HVM success! */
+        rc = 0;
+        kemari_transaction_mode = 1;
+    }
+
+ build: /* building HVM context */
+    DPRINTF("building status %d\n", rc);
+    if (rc == 0)
+    {
+        FILE *qemu_fp;
+        char path[128];
+
+        /* set the EPT identity PT location */
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+            ident_pt[info_active]);
+
+        if ( (frc = xc_set_hvm_param(xc_handle, dom, 
+                                     HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
+             || (frc = xc_set_hvm_param(xc_handle, dom, 
+                                        HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
+             || (frc = xc_set_hvm_param(xc_handle, dom, 
+                                        HVM_PARAM_STORE_PFN, magic_pfns[2]))
+             || (frc = xc_set_hvm_param(xc_handle, dom, 
+                                        HVM_PARAM_PAE_ENABLED, pae))
+             || (frc = xc_set_hvm_param(xc_handle, dom, 
+                                        HVM_PARAM_STORE_EVTCHN,
+                                        store_evtchn))
+             || (frc = xc_set_hvm_param(xc_handle, dom, 
+                                        HVM_PARAM_CALLBACK_IRQ,
+                                        callback_irq)) )
+        {
+            ERROR("error setting HVM params: %i", frc);
+            rc = 3;
+            goto out;
+        }
+        *store_mfn = magic_pfns[2];
+        DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+            magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+
+        frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf[info_active],
+            hvm_buf_size);
+        if ( frc )
+        {
+            ERROR("error setting the HVM context");
+            rc = 4;
+            goto out;
+        }
+
+        if (qemu_image_size[info_active] == 0)
+        {
+            ERROR("Did not received QEMU image");
+            rc = 5;
+            goto out;
+        }
+        snprintf(path, sizeof(path), "/var/lib/xen/qemu-save.%d", dom);
+        if ((qemu_fp = fopen(path, "w")) == NULL)
+        {
+            ERROR("error opening QEMU image");
+            rc = 5;
+            goto out;
+        }
+        if (fwrite(qemu_image[info_active], qemu_image_size[info_active],
+            1, qemu_fp) != 1)
+        {
+            ERROR("error writing QEMU image");
+            rc = 5;
+            goto out;
+        }
+        fclose(qemu_fp);
+    }
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(p2m);
+    free(pfn_type);
+    free(region_mfn);
+    free(p2m_batch);
+    free(tmp_region);
+    free(hvm_buf[0]);
+    free(hvm_buf[1]);
+    free(qemu_image[0]);
+    free(qemu_image[1]);
+
+    /* discard cache for save file  */
+    discard_file_cache(io_fd, 1 /*flush*/);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+    
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff -r 19201eebab16 tools/libxc/xc_dom_kemari_save.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,959 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef  __MINIOS__
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    int len, wrote_len = 0;
+    /* MiniOS stand-in for sendfile(2); only offset == NULL is supported. */
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+    /* Returns bytes copied in_fd -> out_fd (short on EOF), or -1 on error. */
+    while (count > 0) {
+        len = (count < sizeof(buf))?count:sizeof(buf);
+        len = read(in_fd, buf, len);
+        if (len <= 0) /* read error, or EOF: a 0-byte read would spin forever */
+            return (len < 0) ? -1 : wrote_len;
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= len;
+    }
+    return wrote_len;
+}
+#else  /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#endif  /* __MINIOS__ */
+
+/* HVM: two log-dirty bitmaps shared with qemu-dm, indexed by the flags below */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* per-batch list of page frame numbers; allocated by xc_kemari_save() */
+static unsigned long *pfn_type = NULL;
+
+/* HVM: a buffer for holding HVM context, hvm_buf_size bytes long */
+static uint32_t hvm_buf_size = 0;
+static uint8_t *hvm_buf = NULL;
+
+/* Shared-info frame number of the domain being saved. */
+static unsigned long shared_info_frame;
+
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+/* BITMAP_SIZE is in bytes. BITMAP_ENTRY: the word of _bmap holding bit _nr. */
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+/* BITMAP_SHIFT: position of bit _nr inside its word. */
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{   /* 1 if bit nr is set in the bitmap at addr, 0 otherwise */
+    return (BITMAP_ENTRY(nr, addr) & (1UL << BITMAP_SHIFT(nr))) != 0;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{   /* Clear bit nr in the bitmap at addr (plain read-modify-write) */
+    BITMAP_ENTRY(nr, addr) = BITMAP_ENTRY(nr, addr) & ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit ( int nr, volatile void * addr)
+{   /* Set bit nr in the bitmap at addr (plain read-modify-write) */
+    BITMAP_ENTRY(nr, addr) = BITMAP_ENTRY(nr, addr) | (1UL << BITMAP_SHIFT(nr));
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{   /* Microseconds from *old to *new; 64-bit math so a 32-bit long cannot overflow */
+    return (((int64_t)(new->tv_sec - old->tv_sec) * 1000000) +
+            (int64_t)(new->tv_usec - old->tv_usec));
+}
+
+static int noncached_write(int fd, void *buffer, int len) 
+{   /* write_exact() wrapper that periodically discards the file cache */
+    static int bytes_since_discard = 0;
+    int result = len; /* len on a complete write, -1 otherwise */
+    if ( write_exact(fd, buffer, len) != 0 )
+        result = -1;
+    /* The byte count advances even when the write failed. */
+    bytes_since_discard += len;
+    if ( bytes_since_discard >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+    {
+        discard_file_cache(fd, 0 /* no flush */); /* failure is ignored */
+        bytes_since_discard = 0;
+    }
+    return result;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* usecs per BURST_BUDGET at 1 Mb/s (100*1024*8 / 2^20 * 10^6); / mbit_rate */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* Current and previous rate; ratewrite() compares them to spot a change */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate(void)
+{   /* Reset the transmit rate to START_MBIT_RATE */
+    mbit_rate = START_MBIT_RATE;
+}
+
+static int ratewrite(int io_fd, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+    /* Rate-limited noncached_write(): waits until the burst budget covers n. */
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, buf, n);
+    /* Returns whatever noncached_write() returns for the n bytes at buf. */
+    budget -= n;
+    if ( budget < 0 )
+    {
+        if ( mbit_rate != ombit_rate )
+        {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    if ( last_put.tv_usec >= 1000000 ) /* keep tv_usec < 1s */
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0) /* no rate tracking without ADAPTIVE_SAVE */
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate() /* expands to nothing */
+
+#endif
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+    /* Deltas are taken against the previous call (statics above). */
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+    /* Logs the figures via DPRINTF when print != 0; always returns 0. */
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0);
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0);
+    /* Test for the -1 failure value before scaling; -1/1000 would be 0. */
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+    d0_cpu_now /= 1000; d1_cpu_now /= 1000;
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+
+static int send_ident_pt(int xc_handle, int io_fd, uint32_t dom)
+{
+    struct {
+        int minusthree;
+        uint32_t pad;
+        uint64_t ident_pt;
+    } rec = { -3, 0 };
+    /* Emits a "-3" chunk with HVM_PARAM_IDENT_PT; 0 on success, -1 on error. */
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, &rec.ident_pt);
+
+    /* A value of 0 means there is nothing to send. */
+    if ( rec.ident_pt == 0 )
+        return 0;
+    if ( write_exact(io_fd, &rec, sizeof(rec)) )
+    {
+        PERROR("Error when writing the ident_pt for EPT guest");
+        return -1;
+    }
+    return 0;
+}
+
+static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
+{
+    char path[128];
+    struct stat st;
+    struct {
+        int minusfour;
+        uint32_t image_size;
+    } chunk = { -4, 0 };
+    int qemu_fd;
+    int rc = -1;
+    /* Send the qemu save file as a "-4" chunk; returns 0, or -1 on error. */
+    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
+    if ((qemu_fd = open(path, O_RDONLY)) == -1)
+    {
+        PERROR("Error when opening qemu image %s", path);
+        goto out;
+    }
+
+    if (fstat(qemu_fd, &st) == -1)
+    {
+        PERROR("Error fstat qemu file %s", path);
+        goto out;
+    }
+    chunk.image_size = st.st_size;
+
+    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing header for qemu image");
+        goto out;
+    }
+
+    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) != 
+        chunk.image_size)
+    {
+        PERROR("Error when writing qemu image");
+        goto out;
+    }
+    rc = 0;
+out:
+    if (qemu_fd != -1) /* also reached on error paths: do not leak the fd */
+        close(qemu_fd);
+    return rc;
+}
+
+static int send_hvm_context(int xc_handle, int io_fd, uint32_t dom)
+{
+    uint32_t ctx_len;
+
+    /*
+     * Wire format: a uint32_t byte count followed by that many bytes of
+     * HVM context fetched from Xen into the file-scope hvm_buf.
+     * Returns 0 on success, -1 on any failure.
+     */
+    ctx_len = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, hvm_buf_size);
+    if ( ctx_len == -1 )
+    {
+        ERROR("HVM:Could not get hvm buffer");
+        return -1;
+    }
+
+    if ( write_exact(io_fd, &ctx_len, sizeof(ctx_len)) )
+    {
+        PERROR("error write hvm buffer size");
+        return -1;
+    }
+
+    if ( write_exact(io_fd, hvm_buf, ctx_len) )
+    {
+        PERROR("write HVM info failed!\n");
+        return -1;
+    }
+    return 0;
+}
+
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom, uint32_t flags, 
+                   int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+    int rc = 1, i, j, iter = 0;
+    int debug = (flags & XCFLAGS_DEBUG);
+    int sent_last_iter, skip_this_iter;
+    static xc_dominfo_t info;
+    /* Writes p2m_size, shared-info frame, magic pfns, callback irq, the guest
+     * pages in batches and the HVM context to io_fd. Returns 0 or 1 (error). */
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* callback irq */
+    uint64_t callback_irq = 0;
+
+    if ( !hvm )
+    {
+        ERROR("HVM domain is required for the kemari migration.");
+        return 1;
+    }
+
+    initialize_mbit_rate();
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    shared_info_frame = info.shared_info_frame;
+    DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
+    DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+    /* Domain is still running at this point */
+    {
+        /* Get qemu-dm logging dirty pages too */
+        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        qemu_bitmaps[0] = seg;
+        qemu_bitmaps[1] = seg + BITMAP_SIZE;
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* Setup to_send / to_fix bitmaps */
+    to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT)); 
+    to_fix  = calloc(1, BITMAP_SIZE);
+
+    if ( !to_send || !to_fix )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        goto out; /* release to_send/to_fix via the common exit path */
+    }
+
+    {
+        /* Need another buffer for HVM context */
+        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
+        if ( hvm_buf_size == -1 )
+        {
+            ERROR("Couldn't get HVM context size from Xen");
+            goto out;
+        }
+        hvm_buf = malloc(hvm_buf_size);
+        if ( !hvm_buf )
+        {
+            ERROR("Couldn't allocate memory");
+            goto out;
+        }
+    }
+
+    pfn_type   = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+    if ( pfn_type == NULL )
+    {
+        ERROR("failed to alloc memory for pfn_type arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+    memset(pfn_type, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        PERROR("write: p2m_size");
+        goto out;
+    }
+
+    /* send shared_info_frame */
+    if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+    {
+        PERROR("write: shared_info_frame");
+        goto out;
+    }
+
+    /* Save magic-page locations. */
+    memset(magic_pfns, 0, sizeof(magic_pfns));
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     &magic_pfns[0]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     &magic_pfns[1]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     &magic_pfns[2]);
+    DPRINTF("xc_kemari_save: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+        magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+    if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        PERROR("Error when writing to state file (7)");
+        goto out;
+    }
+
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+                     &callback_irq);
+    DPRINTF("xc_kemari_save: callback irq %llx\n", callback_irq);
+    if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+    {
+        PERROR("Error when writing to state file (8)");
+        goto out;
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = N;
+
+                if ( debug )
+                {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+                            iter, (unsigned long)n,
+                            (long unsigned int)0,
+                            test_bit(n, to_send));
+                    DPRINTF("\n");
+                }
+
+                if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+                    continue;
+
+                /* Skip PFNs that aren't really there */
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
+                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) {
+                    if (n >= shared_info_frame && n <= shared_info_frame + 32) {
+                        DPRINTF("shared_info_frame or grant: %d\n", n); 
+                    } else {
+                        continue;
+                    }
+                }
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. add in pages that still need fixup (net bufs)
+                */
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                * and MFNs for PV guests */
+                pfn_type[batch] = n;
+                    
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in psuedo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+                
+                clear_bit(n, to_fix);
+                
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            {
+                /* Look for and skip completely empty batches. */
+                for ( j = 0; j < batch; j++ )
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                         XEN_DOMCTL_PFINFO_XTAB )
+                        break;
+                if ( j == batch )
+                {
+                    munmap(region_base, batch*PAGE_SIZE);
+                    continue; /* bail on this batch: no valid pages */
+                }
+            }
+
+            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                PERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                PERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            run = 0;
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype != 0 )
+                {
+                    /* If the page is not a normal data page, write out any
+                       run of pages we may have previously acumulated */
+                    if ( run )
+                    {
+                        if ( ratewrite(io_fd, 
+                                       (char*)region_base+(PAGE_SIZE*(j-run)), 
+                                       PAGE_SIZE*run) != PAGE_SIZE*run )
+                        {
+                            ERROR("Error when writing to state file (4a)"
+                                  " (errno %d)", errno);
+                            goto out;
+                        }
+                        run = 0;
+                    }
+                }
+
+                /* skip pages that aren't present */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else
+                {
+                    /* We have a normal page: accumulate it for writing. */
+                    run++;
+                }
+            } /* end of the write out for this batch */
+
+            if ( run )
+            {
+                /* write out the last accumulated run of pages */
+                if ( ratewrite(io_fd, 
+                               (char*)region_base+(PAGE_SIZE*(j-run)), 
+                               PAGE_SIZE*run) != PAGE_SIZE*run )
+                {
+                    ERROR("Error when writing to state file (4c)"
+                          " (errno %d)", errno);
+                    goto out;
+                }                        
+            }
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+    } /* end of the single page-sending pass (this block is not a loop) */
+
+    DPRINTF("All memory is saved\n");
+
+    if (send_ident_pt(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+        DPRINTF("status received: %d\n", rcv_status);
+    }
+
+    /* HVM guests are done now */
+    rc = 0;
+
+ out:
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    free(to_send);
+    free(to_fix);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+
+/*
+ * Send one incremental Kemari checkpoint for domain dom over io_fd.
+ *
+ * Dirty pfns from kemari_ring and from qemu-dm's non-active log-dirty bitmap
+ * are sent in batches (count, pfn list, page contents), followed by the
+ * ident_pt record, the qemu image, a zero terminator and the HVM context.
+ * With XCFLAGS_DEBUG the qemu image and the receiver status read are skipped.
+ * Returns 0 on success, 1 on failure.
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom, 
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j, index = 0;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    unsigned char *region_base = NULL;
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /* main iteration starts from here */
+    while (ring->cons < ring->prod) {
+        kemari_ring_read(ring, &buf);
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+            int next, offset = 0;
+
+            index = i * BITS_PER_LONG;
+            kemari_ring_read(ring, &buf);
+            /* NOTE(review): ffs() takes an int; confirm dirty_bitmap fits */
+            while (buf->u.dirty_bitmap && offset < BITS_PER_LONG) {
+                int n;
+                next = ffs(buf->u.dirty_bitmap);
+                buf->u.dirty_bitmap >>= next;
+                offset += next;
+                n = offset + index - 1;
+                /* drop VGA hole / MMIO pfns outside [shared_info_frame, +32] */
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                         || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
+                             && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */
+                    && !(n >= shared_info_frame && n <= shared_info_frame + 32))
+                    continue;
+                pfn_type[batch] = n;
+                batch++;
+            }
+
+            /* flush only when the batch is nearly full or the ring is empty */
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) {
+                if (qemu_bitmaps[qemu_non_active][k] != 0) {
+                    unsigned int bmp = qemu_bitmaps[qemu_non_active][k];
+                    int qemu_offset = 0; /* bits of this word consumed */
+                    index = k * BITS_PER_LONG;
+                    while (bmp && qemu_offset < BITS_PER_LONG) {
+                        int n, next = ffs(bmp);
+                        /* two shifts: next may equal the width of bmp */
+                        bmp >>= next - 1;
+                        bmp >>= 1;
+                        qemu_offset += next;
+                        n = qemu_offset + index - 1;
+                        pfn_type[batch] = n;
+                        batch++;
+                    }
+                    qemu_bitmaps[qemu_non_active][k] = 0;
+                }
+                if (batch >= MAX_BATCH_SIZE) {
+                    ERROR("Sorry, reached MAX_BATCH_SIZE.  "
+                        "We will fix this lator.");
+                    goto out;
+                }
+            }
+
+            /* send pages */
+            region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                                               pfn_type, batch);
+            if (region_base == NULL) {
+                ERROR("map batch failed");
+                /* a failed map is fatal unless the batch is empty */
+                if (batch != 0)
+                    goto out;
+            }
+
+            if( write_exact(io_fd, &batch, sizeof(unsigned int))) {
+                ERROR("Error when writing to state file (2) (errno %d)",
+                      errno);
+                goto out;
+            }
+            PPRINTF("batch %d\n", batch);
+            if( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch)) {
+                ERROR("Error when writing to state file (3) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            for (k = 0; k < batch; k++) {
+                unsigned long pagetype;
+                void *spage = (char *)region_base + (PAGE_SIZE*k);
+                pagetype = pfn_type[k] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+                /* can't happen */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) {
+                    DPRINTF("pfn_type[%d]=%lu=XEN_DOMCTL_PFINFO_XTAB\n", k, pfn_type[k]);
+                    continue;
+                }
+                if ( write_exact(io_fd, spage, PAGE_SIZE)) {
+                        ERROR("Error when writing pfn_type[%d]=%lu to state file (4)"
+                              " (errno %d)", k, pfn_type[k], errno);
+                        goto out;
+                }
+            }
+
+            munmap(region_base, batch*PAGE_SIZE);
+            region_base = NULL;
+            batch = 0;
+        }
+    }
+
+    if (send_ident_pt(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) ) {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+    if (send_hvm_context(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    if (!debug) {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+    /* drop a batch mapping left behind by a failed write */
+    if (region_base != NULL)
+        munmap(region_base, batch*PAGE_SIZE);
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 19201eebab16 tools/xcutils/xc_kemari_restore.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_restore.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,88 @@
+/* 
+ * xc_kemari_restore.c
+ *
+ * Restore the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_restore.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+
+static int io_fd; /* stream fd from argv[1]; closed by close_handler() */
+
+static void close_handler(int sig_type)
+{
+    /* SIGHUP/SIGINT handler: closing io_fd lets xc_kemari_restore move on to the build process */
+    close(io_fd);
+}
+
+/*
+ * usage: iofd domid store_evtchn console_evtchn hvm pae apic
+ * Restores the domain with xc_kemari_restore() and, on success, prints the
+ * store mfn (plus the console mfn for non-HVM guests); returns its status.
+ */
+int main(int argc, char **argv)
+{
+    unsigned int domid, store_evtchn, console_evtchn;
+    unsigned int hvm, pae;
+    int xc_fd, ret, one = 1;
+    unsigned long store_mfn, console_mfn;
+    struct sigaction act;
+
+    if ( argc != 8 )
+        errx(1, "usage: %s iofd domid store_evtchn "
+             "console_evtchn hvm pae apic", argv[0]);
+
+    xc_fd = xc_interface_open();
+    if ( xc_fd < 0 )
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    store_evtchn = atoi(argv[3]);
+    console_evtchn = atoi(argv[4]);
+    hvm  = atoi(argv[5]);
+    pae  = atoi(argv[6]);
+    /* argv[7] (apic) is required on the command line but not used here */
+
+    act.sa_handler = close_handler;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0;
+    sigaction(SIGHUP, &act, 0);
+    sigaction(SIGINT, &act, 0);
+    if ( setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0 )
+        DPRINTF("failed to set TCP_NODELAY");
+
+    ret = xc_kemari_restore(xc_fd, io_fd, domid, store_evtchn, &store_mfn,
+                            console_evtchn, &console_mfn, hvm, pae);
+    if ( ret == 0 ) {
+        /* the mfns are unsigned long, so print them unsigned */
+        printf("store-mfn %lu\n", store_mfn);
+        if ( !hvm )
+            printf("console-mfn %lu\n", console_mfn);
+        fflush(stdout);
+    }
+    xc_interface_close(xc_fd);
+    return ret;
+}
diff -r 19201eebab16 tools/xcutils/xc_kemari_save.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,518 @@
+/* 
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1; /* cleared by close_handler() to stop the main loop */
+static int xc_handle, xce_handle, io_fd;
+static struct kemari_ring *ring = NULL; /* mapped in main(), unmapped in finalize() */
+static uint32_t kemari_ring_size = 0; /* ring size in pages, filled in by XEN_KEMARI_OP_enable */
+static pid_t qemu_pid; /* qemu-dm pid, read from xenstore by get_qemu_pid() */
+static int is_finalized = 0; /* makes finalize() run its teardown only once */
+static int domid;
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm. 
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm 
+ * through the store that it wants to share the bitmaps.  qemu-dm then 
+ * starts filling in the 'active' buffer. 
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer.  xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path; /* xenstore path ending in "/logdirty/active" */
+static char *qemu_next_active_path; /* xenstore path ending in "/logdirty/next-active" */
+static int qemu_shmid = -1; /* shm segment id from init_qemu_maps(), -1 when there is none */
+static struct xs_handle *xs; /* xenstore connection opened by init_qemu_maps() */
+
+
+/* Mark the shared-memory segment for destruction (registered with atexit by init_qemu_maps) */
+static void qemu_destroy_buffer(void)
+{
+    if (qemu_shmid != -1) /* nothing to remove when no segment was created */
+        shmctl(qemu_shmid, IPC_RMID, NULL);
+    qemu_shmid = -1;
+}
+
+static char *kemari_qemu_info = NULL; /* control bytes located after the two bitmaps in the shm segment */
+static void qemu_save_image(int next_active)
+{ /* Store next_active in byte 0, clear byte 1, then signal qemu-dm with SIGUSR1. */
+    kemari_qemu_info[0] = next_active;
+    kemari_qemu_info[1] = 0; /* qemu_end_flip() spins until this is non-zero */
+    xen_wmb(); /* write barrier between the two stores and the signal */
+    kill(qemu_pid, SIGUSR1);
+}
+/* Spin until kemari_qemu_info[1] is non-zero (presumably set by qemu-dm after the flip -- confirm). */
+static void qemu_end_flip(void)
+{
+    while (kemari_qemu_info[1] == 0)
+        xen_rmb(); /* read barrier on every poll of the shared byte */
+}
+/* Spin until kemari_qemu_info[2] is non-zero (presumably set by qemu-dm once its image is saved -- confirm). */
+static void qemu_end_save(void)
+{
+    while (kemari_qemu_info[2] == 0)
+        xen_rmb(); /* read barrier on every poll of the shared byte */
+}
+/* Clear kemari_qemu_info[2], the byte qemu_end_save() waits on, for the next round. */
+static void qemu_image_sent(void)
+{
+    /* after QEMU image sent */
+    kemari_qemu_info[2] = 0;
+    xen_wmb(); /* write barrier after the store */
+}
+/* Create the shm segment shared with qemu-dm, announce it in xenstore and return its address. */
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t key;
+    char key_ascii[17] = {0,};
+    void *seg; 
+    char *path, *p;
+
+    /* Make a shared-memory segment: room for two bitmaps plus one page */
+    do {
+        key = rand(); /* No security, just a sequence of numbers */
+        qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE, 
+                       IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (qemu_shmid == -1 && errno != EEXIST)
+            errx(1, "can't get shmem to talk to qemu-dm");
+    } while (qemu_shmid == -1); /* EEXIST: the key is taken, try another one */
+
+    /* Remember to tidy up after ourselves */
+    atexit(qemu_destroy_buffer);
+
+    /* Map it into our address space */
+    seg = shmat(qemu_shmid, NULL, 0);
+    if (seg == (void *) -1) 
+        errx(1, "can't map shmem to talk to qemu-dm");
+    memset(seg, 0, 2 * bitmap_size + PAGE_SIZE);
+
+    /* Write the size of it into the first 32 bits */
+    *(uint32_t *)seg = bitmap_size;
+
+    /* Tell qemu about it */
+    if ((xs = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+    if (!(path = strdup("/local/domain/0/device-model/")))
+        errx(1, "can't get domain path in store");
+    if (!(path = realloc(path, strlen(path) 
+                         + 10 
+                         + strlen("/logdirty/next-active") + 1))) 
+        errx(1, "no memory for constructing xenstore path");
+    snprintf(path + strlen(path), 11, "%i", domid);
+    strcat(path, "/logdirty/");
+    p = path + strlen(path); /* p marks where the leaf name is written below */
+
+    strcpy(p, "key");
+    snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+    if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+    /* Watch for qemu's indication of the active buffer, and request it 
+     * to start writing to buffer 0 */
+    strcpy(p, "active");
+    if (!xs_watch(xs, path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", path);
+    if (!(qemu_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    strcpy(p, "next-active");
+    if (!(qemu_next_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    /* The control bytes follow the two bitmaps; then request buffer 0 from qemu-dm */
+    kemari_qemu_info = seg + 2 * bitmap_size;
+    xen_wmb();
+    qemu_save_image(0);
+
+    free(path);
+    return seg;
+}
+/* Signal handler: clear run so that the main loop stops. */
+static void close_handler(int sig_type)
+{
+    run = 0;
+}
+/* Handle one kemari event: send a checkpoint, then unmask the port. Returns 0 on success, 1 on failure. */
+static int handle_event(int domid, unsigned int flags)
+{
+    int ret = 1, rcv_port;
+
+    if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) {
+        ERROR("Failed to read from event fd");
+        goto out;
+    }
+
+    if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+       qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) {
+        xc_domain_pause(xc_handle, domid); /* on failure pause the guest and stop qemu-dm */
+        kill(qemu_pid, SIGSTOP);
+        ERROR("xc_kemari_update failed");
+        goto out;
+    }
+
+    if (xc_evtchn_unmask(xce_handle, rcv_port) < 0) {
+        ERROR("Failed to write to event fd");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+/* Install handler for SIGQUIT, SIGINT, SIGHUP and SIGTERM. */
+static void set_signal_handler(void (*handler)(int))
+{
+    struct sigaction act;
+
+    act.sa_handler = handler;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0; /* SA_RESTART is not requested */
+    sigaction(SIGQUIT, &act, 0);
+    sigaction(SIGINT, &act, 0);
+    sigaction(SIGHUP, &act, 0);
+    sigaction(SIGTERM, &act, 0);
+}
+
+/*
+ * Attach to kemari, in KEMARI_TAP_OUT mode, the event channel of every
+ * device listed under /local/domain/<domid>/device/<type>.  Devices that
+ * have no event-channel node are skipped.  label names the port kind in
+ * the log messages.
+ *
+ * Exits through errx() when the directory cannot be listed; otherwise
+ * returns 0, or the result of the first xc_kemari_control() that failed.
+ */
+static int attach_device_ports(struct xs_handle *xs_handle, int domid,
+                               const char *type, const char *label)
+{
+    char **list, *data;
+    unsigned int list_size, data_size, i;
+    char path[128];
+    uint32_t port;
+    int ret = 0;
+
+    snprintf(path, sizeof(path), "/local/domain/%d/device/%s", domid, type);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+            "/local/domain/%d/device/%s/%s/event-channel",
+            domid, type, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;
+        port = strtoul(data, NULL, 10);
+        /* the string is not needed any more; free it before any early exit */
+        free(data);
+
+        ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                &port, NULL, NULL, KEMARI_TAP_OUT);
+        if (ret != 0) {
+            ERROR("Error when attaching %s (%d) on kemari", label, port);
+            break;
+        }
+        DPRINTF("%s %d attached\n", label, port);
+    }
+
+    /* released on the success and the failure path alike */
+    free(list);
+
+    return ret;
+}
+
+/*
+ * Attach the event channels of all vbd (block) and vif (net) devices of
+ * domid to kemari.
+ *
+ * Returns 0 on success and non-zero as soon as one attach fails; exits
+ * through errx() if xenstore cannot be contacted.
+ */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    int ret;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    /* block ports first, then net ports */
+    ret = attach_device_ports(xs_handle, domid, "vbd", "blk_port");
+    if (ret == 0)
+        ret = attach_device_ports(xs_handle, domid, "vif", "net_port");
+
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+/* Read qemu-dm's pid from /local/domain/<domid>/image/device-model-pid; returns 0 if the node cannot be read. */
+static pid_t get_qemu_pid(int domid)
+{
+    struct xs_handle *xs_handle;
+    char path[128];
+    char *data;
+    unsigned int data_size;
+    pid_t pid = 0;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    snprintf(path, sizeof(path),
+        "/local/domain/%d/image/device-model-pid", domid);
+    data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+    if (data == NULL) {
+        ERROR("Could not find QEMU pid for domid %d", domid);
+        goto out;
+    }
+    pid = strtoul(data, NULL, 10); /* decimal pid string from the store */
+    free(data);
+
+out:
+    xs_daemon_close(xs_handle);
+
+    return pid;
+}
+/* Tear down once (atexit handler, also called from main): unmap the ring, turn kemari and shadow mode off. */
+static void finalize(void)
+{
+    int ret;
+
+    if (is_finalized)
+        return;
+
+    set_signal_handler(SIG_IGN); /* ignore further termination signals during teardown */
+    if (ring != NULL)
+        munmap(ring, kemari_ring_size * PAGE_SIZE);
+
+    if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+                            NULL, NULL, NULL, 0)) != 0) {
+        ERROR("Error when turning off kemari");
+    } else {
+        DPRINTF("successufully execute KEMARI_OP_off\n");
+    }
+
+    if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                           NULL, 0, NULL, 0, NULL) < 0 ) {
+        ERROR("Warning - couldn't disable shadow mode");
+    }
+
+    if (!run) /* run was cleared by a signal: destroy the domain as well */
+        xc_domain_destroy(xc_handle, domid);
+
+    xc_interface_close(xc_handle);
+
+    is_finalized = 1;
+}
+
+/*
+ * usage: iofd domid maxit maxf flags   (iofd == -1 selects test mode)
+ * Enables log-dirty mode and kemari, sends the initial image with
+ * xc_kemari_save(), then sends one checkpoint per kemari event until a
+ * signal clears run.  Returns 0 on such a stop and non-zero on any failure.
+ */
+int main(int argc, char **argv)
+{
+    unsigned int maxit, max_f, flags; 
+    int ret = 1; /* stays non-zero on every "goto out" failure path */
+    int evtchn_fd, port; /* port is signed so a bind error is detectable */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    maxit = atoi(argv[3]);
+    max_f = atoi(argv[4]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    if ((qemu_pid = get_qemu_pid(domid)) == 0)
+        errx(1, "failed to get qemu pid");
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY, 
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+       int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+        
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable, 
+                          &kemari_port, &kemari_ring_size, 
+                          &kemari_mfn, 0) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n", 
+           kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    if (xc_kemari_save(xc_handle, io_fd, domid, flags, 
+                       !!(flags & XCFLAGS_HVM),
+                       &init_qemu_maps) != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    if ((port = xc_evtchn_bind_interdomain(xce_handle, domid, 
+                                           kemari_port)) < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN, 
+                                     kemari_ring_size * PAGE_SIZE, PROT_READ | PROT_WRITE, 
+                                     kemari_mfn)) == 0) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    FD_ZERO(&inset);
+    FD_SET(evtchn_fd, &inset);
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have unpaused");
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;
+            ERROR("Error when waiting events by select()");
+            goto out;
+        }
+
+        if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) {
+            if (handle_event(domid, flags) != 0) {
+                ERROR("Error when handling events");
+                goto out;
+            }
+
+            if (xc_evtchn_notify(xce_handle, port) < 0) {
+                ERROR("xc_evtchn_notify failed");
+                goto out;
+            }
+
+            if(xc_domain_unpause(xc_handle, domid) < 0) {
+                ERROR("xc_domain_unpause");
+                goto out;
+            }
+        }
+    }
+
+    /* the loop only ends normally once a signal has cleared run */
+    ret = 0;
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff -r 19201eebab16 tools/xcutils/xc_kemari_test.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_test.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,167 @@
+/*********************************************************************
+ * xc_kemari_test.c
+ *
+ * Test program for VM part of Kemari.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ */
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xen/kemari.h>
+
+#define NUM_OPS 6
+
+static int xc_kemari_all(unsigned int xc_fd, unsigned int domid, 
+                         uint32_t *port, uint32_t *num_pages,
+                         uint64_t *mfn, uint16_t tap_mode) {
+    /* Placeholder for op 0: ignores its arguments, prints a greeting and returns 0. */
+    int ret = 0;
+
+    printf("hello world\n");
+
+    return ret;
+}
+
+static int xc_kemari_enable(unsigned int xc_fd, unsigned int domid, 
+                             uint32_t *port, uint32_t *num_pages,
+                             uint64_t *mfn, uint16_t tap_mode) {
+    /* Op 1: enable log-dirty mode (a failure only warns), then turn kemari on. */
+    int ret;
+
+    if ((ret = xc_shadow_control(xc_fd, domid,
+                          XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                          NULL, 0, NULL, 0, NULL)) < 0) {
+        warnx("Couldn't enable shadow mode");
+    }
+
+    /* parenthesised so that ret holds the call's result, not the comparison */
+    if ((ret = xc_kemari_control(xc_fd, domid, XEN_KEMARI_OP_enable,
+                          port, num_pages, 
+                          mfn, tap_mode)) != 0) {
+        warnx("Error when turning on kemari");
+    }
+    return ret; /* result of xc_kemari_control() */
+}
+
+static int xc_kemari_off(unsigned int xc_fd, unsigned int domid, 
+                             uint32_t *port, uint32_t *num_pages,
+                             uint64_t *mfn, uint16_t tap_mode) {
+    /* Op 2: issue XEN_KEMARI_OP_off; returns the xc_kemari_control() result. */
+    int ret;
+
+    if ((ret = xc_kemari_control(xc_fd, domid, XEN_KEMARI_OP_off,
+                          port, num_pages, 
+                          mfn, tap_mode)) != 0) {
+        warnx("Error when turning off kemari");
+    }
+
+    return ret;
+}
+
+static int xc_kemari_attach(unsigned int xc_fd, unsigned int domid, 
+                             uint32_t *port, uint32_t *num_pages,
+                             uint64_t *mfn, uint16_t tap_mode) {
+    /* Op 3: attach *port with KEMARI_TAP_OUT (tap_mode is ignored); returns the call's result. */
+    int ret;
+
+    if ((ret = xc_kemari_control(xc_fd, domid, XEN_KEMARI_OP_attach,
+                                 port, num_pages, 
+                                 mfn, KEMARI_TAP_OUT)) != 0) {
+        warnx("Error when attaching kemari");
+    }
+
+    return ret;
+}
+
+static int xc_kemari_send(unsigned int xc_fd, unsigned int domid, 
+                             uint32_t *port, uint32_t *num_pages,
+                             uint64_t *mfn, uint16_t tap_mode) {
+    /* Op 5: issue the (deliberately bogus) kemari op 9; returns the call's result. */
+    int ret;
+
+    if ((ret = xc_kemari_control(xc_fd, domid, 9/* bogus */,
+                                 port, num_pages, 
+                                 mfn, tap_mode)) != 0) {
+        warnx("Error when sending events from kemari");
+    }
+
+    return ret;
+}
+
+static int (*kemari_ops[NUM_OPS]) (unsigned int xc_fd, /* table indexed by the "flags" argument of main() */
+                             unsigned int domid, 
+                             uint32_t *port,
+                             uint32_t *num_pages,
+                             uint64_t *mfn, 
+                             uint16_t tap_mode) = 
+{
+    xc_kemari_all,      /* 0 */
+    xc_kemari_enable,   /* 1 */
+    xc_kemari_off,      /* 2 */
+    xc_kemari_attach,   /* 3 */
+    NULL,               /* 4: no handler */
+    xc_kemari_send,     /* 5 */
+};
+
+/* usage: iofd domid flags port tap_mode -- runs kemari_ops[flags] once and returns its result */
+int main(int argc, char **argv)
+{
+    int ret = 1, xc_fd; /* xc_fd is signed so an open failure is detectable */
+    unsigned int io_fd, domid, flags;
+    uint16_t tap_mode;
+    uint32_t port, num_pages;
+    uint64_t mfn;
+
+    if (argc != 6)
+        errx(1, 
+             "usage: %s iofd domid flags port tap_mode",
+             argv[0]);
+
+    xc_fd = xc_interface_open();
+    if (xc_fd < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    flags = atoi(argv[3]);
+    port = atoi(argv[4]);
+    tap_mode = atoi(argv[5]);
+
+    if (flags > NUM_OPS-1) {
+        warnx("function mode must be smaller than %d", NUM_OPS);
+        goto out;
+    }
+    if (kemari_ops[flags] == NULL) { /* the table has an empty slot */
+        warnx("function mode %u is not implemented", flags);
+        goto out;
+    }
+    ret = kemari_ops[flags](xc_fd, domid, &port, &num_pages, &mfn, tap_mode);
+ out:
+    xc_interface_close(xc_fd);
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 19201eebab16 xen/arch/x86/kemari/Makefile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/kemari/Makefile	Tue Nov 18 21:04:58 2008 +0900
@@ -0,0 +1,4 @@
+#CFLAGS += -DDEBUG=2
+
+obj-y += kemari.o
+#obj-y += evtchn_tap.o
diff -r 19201eebab16 xen/arch/x86/kemari/kemari.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/kemari/kemari.c	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,650 @@
+/******************************************************************************
+ * kemari.c
+ *
+ * The hypervisor part of VM synchronization mechanism (Kemari).
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * 
+ * Copied log_dirty_lock(_d), log_dirty_unlock(_d) and paging_log_dirty_op()
+ * from arch/x86/paging.c.
+ *
+ * x86 specific paging support
+ * Copyright (c) 2007 Advanced Micro Devices (Wei Huang)
+ * Copyright (c) 2007 XenSource Inc.
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/event.h>
+#include <xen/kemari.h>
+#include <xen/mm.h>
+#include <xen/domain.h>
+
+#include <public/kemari.h>
+#include <asm/domain.h>
+#include <asm/hvm/support.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/shadow.h>
+#include <asm/types.h>
+
+#define log_dirty_lock(_d) /* BUG() on same-CPU re-entry, else take lock */  \
+    do { /* records owner CPU and function name once the lock is held */     \
+        if (unlikely((_d)->arch.paging.log_dirty.locker==current->processor))\
+        {                                                                    \
+            printk("Error: paging log dirty lock held by %s\n",              \
+                   (_d)->arch.paging.log_dirty.locker_function);             \
+            BUG();                                                           \
+        }                                                                    \
+        spin_lock(&(_d)->arch.paging.log_dirty.lock);                        \
+        ASSERT((_d)->arch.paging.log_dirty.locker == -1);                    \
+        (_d)->arch.paging.log_dirty.locker = current->processor;             \
+        (_d)->arch.paging.log_dirty.locker_function = __func__;              \
+    } while (0)
+
+#define log_dirty_unlock(_d) /* asserts this CPU is the owner, then drops */ \
+    do { /* owner fields are reset before the spinlock is released */     \
+        ASSERT((_d)->arch.paging.log_dirty.locker == current->processor); \
+        (_d)->arch.paging.log_dirty.locker = -1;                          \
+        (_d)->arch.paging.log_dirty.locker_function = "nobody";           \
+        spin_unlock(&(_d)->arch.paging.log_dirty.lock);                   \
+    } while (0)
+
+static void kemari_send_domaininfo_ctxt(struct kemari_ring *ring,
+                                        struct domain *d)
+{   /* Save d's HVM context into the buffer described by ring->hvm_ctxt. */
+    struct hvm_domain_context ctxt;
+
+    if ( !d->is_paused_by_controller )  /* only snapshot a controller-paused domain */
+    {
+        dprintk(XENLOG_ERR, "Domain isn't paused\n");        
+        return;  /* ring->hvm_ctxt.rec_size is left unchanged on this path */
+    }
+
+    ctxt.cur = 0;
+    ctxt.size = ring->hvm_ctxt.buf_size;
+    ctxt.data = ring->hvm_ctxt.buf;
+    hvm_save(d, &ctxt);  /* NOTE(review): return value is not checked */
+    ring->hvm_ctxt.rec_size = ctxt.cur;  /* cursor after the save; presumably bytes written - confirm */
+}
+
+static long kemari_send_dirty_bitmap_page(struct kemari_ring *ring,
+                                          struct domain *d,
+                                          unsigned long *dirty_bitmap,
+                                          uint16_t index, unsigned int bytes)
+{   /* Emit each run of non-zero bitmap words to the ring; d is not used here. */
+    uint16_t i, j;
+    struct kemari_ent *buf;
+
+    for ( i = 0; i < bytes / BYTES_PER_LONG; i++ )
+    {
+        j = i;
+
+        while ( (j < bytes / BYTES_PER_LONG) && (dirty_bitmap[j] != 0) )
+            j++;  /* j ends one past the last non-zero word of this run */
+
+        if ( i == j )
+            continue;  /* word i is clean */
+
+        buf = KEMARI_RING_GET_PROD(ring);  /* run header entry: [start, end) word range */
+        buf->u.index.start = i + index;  /* stored in a uint16_t field */
+        buf->u.index.end = j + index;
+        wmb();
+        ring->prod++;  /* NOTE(review): no check here that the ring has a free slot */
+
+        while( i < j )
+        {
+            buf = (struct kemari_ent *)&dirty_bitmap[i];
+            kemari_ring_write(ring, buf);  /* one ring entry per dirty word */
+            i++;
+        }
+    }
+    return i;  /* a word index; this function never returns a negative value */
+}
+
+/* Based on paging_log_dirty_op() in xen/arch/x86/mm/paging.c: walks d's log-dirty tree, sends dirty words to ring, clears them. */
+static long kemari_send_dirty_bitmap(struct kemari_ring *ring,
+                                     struct domain *d)
+{
+    long ret = 0, clean = 1, peek = 1;  /* clean and peek are never changed below */
+    unsigned long pages = 0;
+    unsigned long p2m_size;
+    mfn_t *l4, *l3, *l2;
+    unsigned long *l1;
+    int i4, i3, i2;
+    uint16_t index = 0;  /* word offset of the current leaf page, handed to the page sender */
+
+    log_dirty_lock(d);
+
+    if ( clean )
+    {
+        d->arch.paging.log_dirty.fault_count = 0;
+        d->arch.paging.log_dirty.dirty_count = 0;
+    }
+
+    if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+    {
+        ret = -EINVAL; /* perhaps should be ENOMEM? */
+        goto out;
+    }
+
+    if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+        printk("%s: %d failed page allocs while logging dirty pages\n",
+               __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    pages = 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+
+    p2m_size = domain_get_maximum_gpfn(d) + 1;
+
+    for ( i4 = 0;
+          (pages < p2m_size) && (i4 < LOGDIRTY_NODE_ENTRIES);
+          i4++ )
+    {
+        l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+        for ( i3 = 0; 
+              (pages < p2m_size) && (i3 < LOGDIRTY_NODE_ENTRIES);
+              i3++ )
+        {
+            l2 = ((l3 && mfn_valid(l3[i3])) ?
+                  map_domain_page(mfn_x(l3[i3])) : NULL);
+            for ( i2 = 0;
+                  (pages < p2m_size) && (i2 < LOGDIRTY_NODE_ENTRIES);
+                  i2++ )
+            {
+                unsigned int bytes = PAGE_SIZE;
+                l1 = ((l2 && mfn_valid(l2[i2])) ?
+                      map_domain_page(mfn_x(l2[i2])) : NULL);
+                if ( unlikely(((p2m_size - pages + 7) >> 3) < bytes) )  /* last, partial leaf */
+                    bytes = (unsigned int)((p2m_size - pages +
+                                            BITS_PER_LONG - 1) >> 3);
+                if ( likely(peek) )
+                {
+                    if ( l1 != NULL &&
+                         kemari_send_dirty_bitmap_page(ring, d, l1,
+                                                       index, bytes) < 0 )
+                    {
+                        ret = -EFAULT;
+                        dprintk(XENLOG_ERR,
+                                "%s: kemari_send_dirty_bitmap_page\n",
+                                __FUNCTION__);
+                        goto out;  /* NOTE(review): the l1..l4 mappings are not unmapped on this path */
+                    }
+                }
+                index += PAGE_SIZE / BYTES_PER_LONG;  /* advance by one leaf page of words; index is uint16_t */
+
+                if ( clean && l1 != NULL )
+                    clear_page(l1);
+                pages += bytes << 3;  /* bits covered by this leaf */
+                if ( l1 != NULL )
+                    unmap_domain_page(l1);
+            }
+            if ( l2 )
+                unmap_domain_page(l2);
+        }
+        if ( l3 )
+            unmap_domain_page(l3);
+    }
+    unmap_domain_page(l4);
+
+    log_dirty_unlock(d);
+
+    if ( clean )
+    {
+        /* We need to further call clean_dirty_bitmap() functions of specific
+         * paging modes (shadow or hap).  Safe because the domain is paused. */
+        d->arch.paging.log_dirty.clean_dirty_bitmap(d);
+    }
+
+    return ret;
+
+ out:
+    log_dirty_unlock(d);  /* error exit: only the log-dirty lock is released here */
+
+    return ret;
+}
+
+static void kemari_guest_notify(struct kemari *kemari)
+{   /* Notify kemari->port of kemari->domain via the tap notifier; does nothing when kemari is NULL. */
+    if ( likely(kemari != NULL) )
+        notify_via_xen_evtchn_tap(kemari->domain, kemari->port);
+}
+
+/* VM synchronization entry point; installed as the tap redirect callback by kemari_bind_tap(). */
+static long run_kemari(struct evtchn *lchn, struct evtchn *rchn)
+{
+    long ret;
+    uint32_t port;
+    uint64_t *events;
+    struct domain *d, *rd = lchn->u.interdomain.remote_dom;
+    struct kemari *kemari;
+    struct kemari_ring *ring;
+
+    if (lchn->tap.mode & KEMARI_TAP_OUT)  /* tapped channel belongs to the current domain */
+    {
+        domain_pause_for_debugger();
+        d = current->domain;
+        kemari = d->kemari;
+        port = rchn->u.interdomain.remote_port;
+        events = &kemari->taps[port].out_events;  /* NOTE(review): kemari is not NULL-checked and port is not range-checked here */
+    }
+    else if (rchn->tap.mode & KEMARI_TAP_IN)  /* tapped channel belongs to the remote domain */
+    {
+        domain_pause_by_systemcontroller(rd);
+        d = rd;
+        kemari = rd->kemari;
+        port = lchn->u.interdomain.remote_port;
+        events = &kemari->taps[port].in_events;  /* NOTE(review): same unchecked kemari/port as above */
+    }
+    else
+    {
+        ret = 0;  /* neither tap direction applies: nothing to do */
+        goto out;
+    }
+
+    spin_lock(&d->grant_table->lock);  /* held across the context save and bitmap send below */
+
+    ++*events;  /* per-tap event counter */
+
+    ring = kemari->ring;
+
+    kemari_send_domaininfo_ctxt(ring, d);
+
+    ret = kemari_send_dirty_bitmap(ring, d);
+    if ( ret < 0 )
+        goto unlock_out;
+
+    kemari_guest_notify(kemari);
+
+    prepare_wait_on_xen_event_channel(kemari->port);
+
+    ret = 0;
+
+ unlock_out:
+    spin_unlock(&d->grant_table->lock);  /* the domain paused above is not unpaused in this function */
+
+ out:
+    return ret;
+}
+
+static long kemari_bind_tap(struct domain *d,
+                            struct xen_domctl_kemari_op *kemari_op)
+{   /* Request a tap on attach.port of d with run_kemari() as the redirect callback; returns evtchn_bind_tap()'s result. */
+    long ret;
+    struct evtchn_bind_tap bind_tap;
+
+    bind_tap.tap_dom = d->domain_id;
+    bind_tap.tap_port = kemari_op->u.attach.port;
+    bind_tap.mode = kemari_op->u.attach.evtchn_tap_mode;
+    bind_tap.redirect = run_kemari;
+
+    ret = evtchn_bind_tap(&bind_tap);
+
+    return ret;
+}
+
+static long kemari_unbind_tap(struct domain *d,
+                              struct xen_domctl_kemari_op *kemari_op)
+{   /* Request removal of the tap on detach.port of d; returns evtchn_unbind_tap()'s result. */
+    long ret;
+    struct evtchn_bind_tap unbind_tap;  /* .redirect is left unset here */
+
+    unbind_tap.tap_dom = d->domain_id;
+    unbind_tap.tap_port = kemari_op->u.detach.port;
+    unbind_tap.mode = KEMARI_TAP_OFF;
+
+    ret = evtchn_unbind_tap(&unbind_tap);
+
+    return ret;
+}
+
+/* Bind a tap on attach.port and mark it attached.  -EINVAL if Kemari is off
+ * or the port would overrun taps[]; else kemari_bind_tap()'s result. */
+static long kemari_attach(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    uint32_t port = kemari_op->u.attach.port;
+    struct kemari *kemari = d->kemari;
+
+    dprintk(XENLOG_DEBUG, "%s: in\n", __FUNCTION__);
+    ret = -EINVAL;
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari is off\n");
+        goto out;
+    }
+    if ( unlikely(port >= NUM_KEMARI_TAPS) )
+    {
+        dprintk(XENLOG_ERR, "tap port=%u out of range\n", port);
+        goto out;
+    }
+    dprintk(XENLOG_DEBUG, "%s: kemari_bind_tap\n", __FUNCTION__);
+    ret =  kemari_bind_tap(d, kemari_op);
+    if (ret < 0)
+    {
+        dprintk(XENLOG_ERR, "couldn't bind evtchn tap port=%u\n", port);
+        goto out;
+    }
+    kemari->taps[port].status = KEMARI_TAP_ATTACHED;
+ out:
+    dprintk(XENLOG_DEBUG, "%s: out\n", __FUNCTION__);
+    return ret;
+}
+
+/* Unbind the tap on detach.port and mark it detached.  -EINVAL if Kemari is
+ * off, the port would overrun taps[], or the tap is not attached. */
+static long kemari_detach(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret = -EINVAL;
+    uint32_t port = kemari_op->u.detach.port;
+    struct kemari *kemari = d->kemari;
+    struct kemari_tap *tap;
+
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari is off\n");
+        goto out;
+    }
+    /* Only form the taps[] pointer once kemari and port are validated. */
+    if ( unlikely(port >= NUM_KEMARI_TAPS) )
+        goto out;
+    tap = &kemari->taps[port];
+    if ( unlikely(tap->status != KEMARI_TAP_ATTACHED) )
+        goto out;
+    ret =  kemari_unbind_tap(d, kemari_op);
+    if (ret < 0)
+        goto out;
+    tap->status = KEMARI_TAP_DETACHED;
+ out:
+    return ret;
+}
+
+static void share_kemari_page_with_privileged_guests(struct kemari *kemari)
+{   /* Share each of the kemari->num_pages ring pages, writable, with privileged guests. */
+    int i;
+    struct kemari_ring *ring = kemari->ring;
+
+    for ( i = 0; i < kemari->num_pages; i++ )
+        share_xen_page_with_privileged_guests(virt_to_page(ring) + i, 
+                                              XENSHARE_writable);
+}
+
+static void unshare_kemari_page_with_privileged_guests(struct kemari *kemari)
+{   /* For each ring page from kemari->mfn on: drop a page reference if _PGC_allocated was still set. */
+    int i;
+
+    for ( i = 0; i < kemari->num_pages; i++ )
+    {
+        struct page_info *page = mfn_to_page(kemari->mfn + i);
+        BUG_ON(page_get_owner(page) != dom_xen);  /* ring pages must be owned by dom_xen */
+        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+            put_page(page);
+    }
+}
+
+/* Tear down d's Kemari ring: event channel, page sharing and xenheap pages. */
+static void kemari_free_ring(struct domain *d)
+{
+    int order;
+    struct vcpu *v = d->vcpu[0];
+    struct kemari *kemari = d->kemari;
+
+    /* Nothing to release if the ring was never set up or is already gone. */
+    if ( kemari->ring == NULL   ||
+         kemari->num_pages == 0 ||
+         kemari->port == 0 )
+        return;
+    free_xen_event_channel(v, kemari->port);
+    unshare_kemari_page_with_privileged_guests(kemari);
+
+    order = get_order_from_pages(kemari->num_pages);
+    free_xenheap_pages(kemari->ring, order);
+    kemari->mfn = 0;
+    kemari->ring = NULL;
+    kemari->num_pages = 0;
+    kemari->port = 0;
+}
+
+/* Allocate d's Kemari event channel and ring (header, entries, HVM context
+ * buffer at the tail).  Returns 0, or a negative error (channel released). */
+static long kemari_alloc_ring(struct domain *d, struct kemari *kemari)
+{
+    long ret;
+    unsigned int order;
+    unsigned long num_pages;
+    domid_t current_domid = current->domain->domain_id;
+    struct vcpu *v = d->vcpu[0];
+    struct kemari_ring *ring;
+    unsigned long dirty_bitmap_size;
+    uint32_t hvm_buf_size;
+
+    ret = alloc_unbound_xen_event_channel(v, current_domid);
+    if ( ret < 0 )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc xen_event_channel\n");
+        goto out;
+    }
+    kemari->port = ret;
+    /* Size of the log-dirty bitmap in bytes, rounded up to whole longs. */
+    dirty_bitmap_size = (BITS_TO_LONGS(domain_get_maximum_gpfn(d) + 1)
+                         * sizeof(unsigned long));
+
+    ret = -EINVAL;
+    if ( dirty_bitmap_size == 0 || !mfn_valid(d->arch.paging.log_dirty.top) )
+    {
+        dprintk(XENLOG_ERR, "dirty_bitmap is EMPTY\n");
+        goto out_evtchn;
+    }
+
+    hvm_buf_size = hvm_save_size(d);
+    num_pages = (sizeof(struct kemari_ring)
+                 + hvm_buf_size
+                 + (dirty_bitmap_size >> 3)  /* NOTE(review): 1/8 of the bitmap bytes - confirm intent */
+                 + PAGE_SIZE - 1) / PAGE_SIZE;
+    order = get_order_from_pages(num_pages);
+    num_pages = (1UL << order);  /* round up to the allocation order */
+    /* sizeof yields size_t, so cast it to match the %u conversion. */
+    dprintk(XENLOG_DEBUG, "ring=%u, bitmap=%lu, ctxt=%u, PAGE=%ld\n",
+            (unsigned int)sizeof(struct kemari_ring), dirty_bitmap_size / 8,
+            hvm_buf_size, PAGE_SIZE);
+
+    ret = -ENOMEM;
+    ring = alloc_xenheap_pages(order);
+    if ( ring == NULL )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc xenheap_pages\n");
+        goto out_evtchn;
+    }
+    memset(ring, 0, PAGE_SIZE * num_pages);
+    /* Entries fill the space between data[] and the trailing HVM buffer. */
+    ring->num_ents =
+        (PAGE_SIZE * num_pages - hvm_buf_size + (long)ring - (long)ring->data)
+        / sizeof(struct kemari_ent);
+    ring->hvm_ctxt.buf_size = hvm_buf_size;
+    ring->hvm_ctxt.buf = (uint8_t *)ring + PAGE_SIZE * num_pages - hvm_buf_size;
+
+    kemari->num_pages = num_pages;
+    kemari->mfn = virt_to_mfn(ring);
+    kemari->ring = ring;
+    share_kemari_page_with_privileged_guests(kemari);
+
+    dprintk(XENLOG_DEBUG, "num_ents=%u, num_pages=%u\n",
+            ring->num_ents, kemari->num_pages);
+    return 0;
+
+ out_evtchn:
+    free_xen_event_channel(v, kemari->port);
+ out:
+    return ret;
+}
+
+/* Enable Kemari for d: allocate its state and ring with d paused, and report
+ * the ring's port, mfn and page count in kemari_op.  -EBUSY if already on. */
+static long kemari_enable(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    struct kemari *kemari;
+
+    ret = -EBUSY;
+    if ( unlikely(d->kemari != NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari already enabled\n");
+        goto out;
+    }
+
+    ret = -ENOMEM;
+    kemari = xmalloc_bytes(sizeof(struct kemari));
+    if ( kemari == NULL )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc kemari\n");
+        goto out;
+    }
+    memset(kemari, 0, sizeof(struct kemari) );
+
+    domain_pause_by_systemcontroller(d);
+    ret = kemari_alloc_ring(d, kemari);
+    if ( ret < 0 )
+        goto kemari_free;
+
+    kemari_op->u.enable.port = kemari->port;
+    kemari_op->u.enable.mfn = kemari->mfn;
+    kemari_op->u.enable.num_pages = kemari->num_pages;
+
+    /* mfn is uint64_t; cast so the argument matches %llu on every build. */
+    dprintk(XENLOG_DEBUG, "port=%u, mfn=%llu\n",
+            kemari->port, (unsigned long long)kemari->mfn);
+
+    kemari->domain = d;
+    d->kemari = kemari;
+
+    domain_unpause_by_systemcontroller(d);
+    dprintk(XENLOG_DEBUG, "kemari enabled\n");
+    return 0;
+
+ kemari_free:
+    xfree(kemari);
+    domain_unpause_by_systemcontroller(d);
+ out:
+    return ret;
+}
+
+/* Disable Kemari for d: with d paused, unbind every attached tap, free the
+ * ring and the kemari state.  Returns -EINVAL if Kemari is not enabled. */
+static long kemari_off(struct domain *d,
+                       struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    uint32_t port;
+    struct kemari *kemari = d->kemari;
+    struct kemari_tap *tap;
+    struct evtchn_bind_tap unbind;  /* argument block for evtchn_unbind_tap() */
+
+    ret = -EINVAL;
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari already off\n");
+        goto out;
+    }
+
+    domain_pause_by_systemcontroller(d);
+
+    /* Fill in the same fields kemari_unbind_tap() passes, mode included. */
+    unbind.tap_dom = d->domain_id;
+    unbind.mode = KEMARI_TAP_OFF;
+
+    for ( port = 0; port < NUM_KEMARI_TAPS; port++ ) {
+        tap = &kemari->taps[port];
+        if ( tap->status != KEMARI_TAP_ATTACHED )
+            continue;
+
+        unbind.tap_port = port;
+        if ( evtchn_unbind_tap(&unbind) < 0 )
+            dprintk(XENLOG_ERR, "couldn't unbind evtchn tap port=%u\n", port);
+    }
+
+    if ( kemari->ring )
+        kemari_free_ring(d);
+
+    /* Clear the domain's pointer before the memory it refers to is freed. */
+    d->kemari = NULL;
+    xfree(kemari);
+
+    domain_unpause_by_systemcontroller(d);
+    return 0;
+
+ out:
+    return ret;
+}
+
+/* Entry point to Kemari: dispatch kemari_op->cmd for target domain d.
+ * All commands are serialised by one function-local spinlock. */
+long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op)
+{
+    static DEFINE_SPINLOCK(lock);
+    long ret;
+
+    /* We don't support calling kemari by itself or dom0. */
+    if ( d == current->domain || d == dom0 )
+    {
+        dprintk(XENLOG_ERR, "can't attach kemari by itself or to dom0\n");
+        return -EINVAL;
+    }
+
+    spin_lock(&lock);
+    switch ( kemari_op->cmd )
+    {
+    case XEN_KEMARI_OP_enable:
+        ret = kemari_enable(d, kemari_op);
+        break;
+
+    case XEN_KEMARI_OP_off:
+        ret = kemari_off(d, kemari_op);
+        break;
+
+    case XEN_KEMARI_OP_attach:
+        ret = kemari_attach(d, kemari_op);
+        break;
+
+    case XEN_KEMARI_OP_detach:
+        ret = kemari_detach(d, kemari_op);
+        break;
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+    spin_unlock(&lock);
+
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 19201eebab16 xen/include/public/kemari.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/kemari.h	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * kemari.h
+ *
+ * Tools interface to Kemari.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __XEN_PUBLIC_KEMARI_H__
+#define __XEN_PUBLIC_KEMARI_H__
+
+#define KEMARI_TAP_OFF 0
+#define KEMARI_TAP_IN  1
+#define KEMARI_TAP_OUT 2
+
+struct kemari_ring {  /* ring header; struct kemari_ent slots start at data[] */
+    uint32_t cons;  /* consumer counter; taken modulo num_ents when indexing */
+    uint32_t prod;  /* producer counter; taken modulo num_ents when indexing */
+    uint32_t num_ents;  /* number of kemari_ent slots addressed through data[] */
+    unsigned int dirty_bitmap_size; /* num of dirty bits */
+    struct {
+        uint32_t buf_size;  /* capacity of buf */
+        uint32_t rec_size;  /* set by the hypervisor from the save cursor after hvm_save() */
+        uint8_t  *buf;  /* NOTE(review): a pointer inside a public structure - confirm how consumers use it */
+    } hvm_ctxt;
+    char     data[1];
+} __attribute__ ((__aligned__(4)));
+
+struct kemari_ent {  /* one ring slot; which union member applies depends on position in the stream */
+    union {
+        struct {
+            uint16_t pages;
+            uint16_t port;
+        } header;
+        struct {
+            uint16_t start;  /* first bitmap word index of a dirty run */
+            uint16_t end;            
+        } index;
+        unsigned long dirty_bitmap;  /* one word of the dirty bitmap */
+    } u;
+};
+
+#define KEMARI_RING_GET_PROD(_ring) /* slot at prod modulo num_ents */ \
+    (&((struct kemari_ent *)(_ring)->data)[(_ring)->prod % (_ring)->num_ents])
+
+#define KEMARI_RING_GET_CONS(_ring) /* slot at cons modulo num_ents */ \
+    (&((struct kemari_ent *)(_ring)->data)[(_ring)->cons % (_ring)->num_ents])
+
+static inline void kemari_ring_read(struct kemari_ring *ring,
+                                    struct kemari_ent **buf)
+{   /* Hand back a pointer to the current consumer slot and advance cons. */
+    *buf = KEMARI_RING_GET_CONS(ring);  /* pointer into the ring, not a copy */
+#ifdef __XEN__
+    wmb();
+#elif __XEN_TOOLS__
+    xen_wmb();
+#endif
+    ring->cons++;  /* NOTE(review): cons advances before the caller has dereferenced *buf */
+}
+
+static inline void kemari_ring_write(struct kemari_ring *ring, 
+                                     struct kemari_ent *buf)
+{   /* Copy *buf into the current producer slot, then publish it by advancing prod. */
+    memcpy(KEMARI_RING_GET_PROD(ring), buf, sizeof(struct kemari_ent));
+#ifdef __XEN__
+    wmb();
+#elif __XEN_TOOLS__
+    xen_wmb();
+#endif
+    ring->prod++;  /* NOTE(review): no check against cons, so a full ring is not detected here */
+}
+
+#endif /* __XEN_PUBLIC_KEMARI_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 19201eebab16 xen/include/xen/kemari.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/kemari.h	Wed Nov 19 13:50:59 2008 +0900
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * kemari.h
+ *
+ * Kemari header file.
+ *
+ * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __XEN_KEMARI_H__
+#define __XEN_KEMARI_H__
+
+#include <public/domctl.h>
+
+#define NUM_KEMARI_TAPS 32
+
+#define _KEMARI_TAP_ATTACHED 0
+#define KEMARI_TAP_ATTACHED (1UL<<_KEMARI_TAP_ATTACHED)
+#define _KEMARI_TAP_DETACHED 1
+#define KEMARI_TAP_DETACHED (1UL<<_KEMARI_TAP_DETACHED)
+
+struct kemari_tap {  /* per-port tap state; struct kemari holds NUM_KEMARI_TAPS of these */
+    uint64_t status;  /* compared against KEMARI_TAP_ATTACHED / KEMARI_TAP_DETACHED */
+    uint64_t in_events;  /* presumably counts KEMARI_TAP_IN events - confirm against the hypervisor code */
+    uint64_t out_events;  /* presumably counts KEMARI_TAP_OUT events - confirm against the hypervisor code */
+};
+
+/* Main data structure of Kemari; one instance per Kemari-enabled domain. */
+struct kemari {
+    struct domain      *domain;  /* the domain this state belongs to */
+
+    struct kemari_ring *ring;  /* ring buffer; NULL when not allocated */
+
+    uint32_t           port;  /* Xen event channel used for ring notifications */
+
+    uint32_t           num_pages;  /* number of pages backing ring */
+
+    uint64_t           mfn;  /* machine frame number of the first ring page */
+
+    uint64_t           num_events;  /* NOTE(review): purpose not evident from this header - confirm */
+
+    uint64_t           priv_dirty_pages;  /* NOTE(review): purpose not evident from this header - confirm */
+
+    struct kemari_tap  taps[NUM_KEMARI_TAPS];  /* indexed by event channel port */
+};
+
+/* Entry point to Kemari */
+long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
