/* $Id$ */

/*
 * NetBSD kernel wrapper for KQEMU
 */

/*
 * Based on Linux kernel wrapper for KQEMU
 *
 * Copyright (C) 2004-2007 Fabrice Bellard
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* XXX: the header names below are a reconstruction of the include list,
   which was lost; verify against the original sources. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#if __NetBSD_Version__ >= 499001000 /* post-newlock2 */
#include <sys/mutex.h>
#endif
#include <sys/lkm.h>
#include <uvm/uvm_extern.h>
#include <machine/stdarg.h>

#include "kqemu-kernel.h"

extern int kqemu_lkmentry(struct lkm_table *, int, int);

MALLOC_DEFINE(M_KQEMU, "kqemu", "kqemu");
MALLOC_DECLARE(M_KQEMU);

int kqemu_debug = 0;
#define DPRINTF		if (kqemu_debug) kqemu_log
#define DPRINTF1	if (kqemu_debug > 1) kqemu_log
#define DPRINTF2	if (kqemu_debug > 2) kqemu_log

int lock_count = 0;
int page_alloc_count = 0;
int malloc_count = 0;
int io_map_count = 0;

/* Lock the page at virtual address 'user_addr' and return its
   physical address (page index). Return a host OS private user page
   identifier or NULL if error */
/* Use VA as the identifier (see FreeBSD and Windows) */
struct kqemu_user_page *CDECL kqemu_lock_user_page(unsigned long *ppage_index,
                                                   unsigned long user_addr)
{
    struct vm_map *map = &curproc->p_vmspace->vm_map;
    vaddr_t va = user_addr;
    paddr_t pa;
    int ret;

    DPRINTF1("kqemu_lock_user_page(%08lx)\n", user_addr);

    /* wire the page so that it cannot be paged out while kqemu uses it */
    ret = uvm_map_pageable(map, va, va + PAGE_SIZE, FALSE, 0);
    if (ret != 0) {
        printf("kqemu_lock_user_page: uvm_map_pageable failed, "
               "pid %d, vm_map %p, va %08lx, errno %d\n",
               curproc->p_pid, map, va, ret);
        return NULL;
    }

    if (pmap_extract(vm_map_pmap(map), va, &pa) == FALSE) {
        printf("kqemu_lock_user_page: pmap_extract failed, "
               "pmap %p, va %08lx\n", vm_map_pmap(map), user_addr);
        /* undo the wiring done above before failing */
        uvm_map_pageable(map, va, va + PAGE_SIZE, TRUE, 0);
        return NULL;
    }

    *ppage_index = pa >> PAGE_SHIFT;
    lock_count++;
    return (struct kqemu_user_page *)va;
}

void CDECL kqemu_unlock_user_page(struct kqemu_user_page *page1)
{
    struct vm_map *map = &curproc->p_vmspace->vm_map;
    vaddr_t va;
    int ret;

    DPRINTF1("kqemu_unlock_user_page(%p)\n", page1);

    va = (vaddr_t)page1;
    /* unwire the page again */
    ret = uvm_map_pageable(map, va, va + PAGE_SIZE, TRUE, 0);
    if (ret != 0) {
        printf("kqemu_unlock_user_page: uvm_map_pageable failed, "
               "pid %d, vm_map %p, va %08lx, errno %d\n",
               curproc->p_pid, map, va, ret);
    }
    lock_count--;
}
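/*
 * Usage sketch (an assumption about the machine-independent kqemu core,
 * which is not part of this file): the two callbacks above are used as a
 * pair, wiring a guest page before it is accessed and unwiring it when
 * the reference is dropped, roughly:
 *
 *	unsigned long page_index;
 *	struct kqemu_user_page *p;
 *
 *	p = kqemu_lock_user_page(&page_index, user_addr);
 *	if (p != NULL) {
 *		... use the physical page at page_index << PAGE_SHIFT ...
 *		kqemu_unlock_user_page(p);
 *	}
 */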
/* Allocate a new page and return its physical address (page index).
   Return a host OS private page identifier or NULL if error */
struct kqemu_page *CDECL kqemu_alloc_zeroed_page(unsigned long *ppage_index)
{
    vaddr_t va;
    paddr_t pa;

    va = uvm_km_alloc(kernel_map, PAGE_SIZE, PAGE_SIZE,
                      UVM_KMF_WIRED | UVM_KMF_ZERO);
    if (va == 0) {
        printf("kqemu_alloc_zeroed_page: uvm_km_alloc failed.\n");
        return NULL;
    }
    if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) {
        printf("kqemu_alloc_zeroed_page: pmap_extract failed, "
               "pmap %p, va %08lx\n", pmap_kernel(), va);
        uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_WIRED);
        return NULL;
    }

    DPRINTF1("kqemu_alloc_zeroed_page: %08lx\n", (unsigned long)pa);

    *ppage_index = pa >> PAGE_SHIFT;
    page_alloc_count++;
    return (struct kqemu_page *)va;
}

void CDECL kqemu_free_page(struct kqemu_page *page)
{
    DPRINTF1("kqemu_free_page(%p)\n", page);
    uvm_km_free(kernel_map, (vaddr_t)page, PAGE_SIZE, UVM_KMF_WIRED);
    page_alloc_count--;
}

/* Return a host kernel address of the physical page whose private
   identifier is 'page1' */
void * CDECL kqemu_page_kaddr(struct kqemu_page *page)
{
    vaddr_t va = (vaddr_t)page;
    return (void *)va;
}

/* Allocate 'size' bytes of memory in host kernel address space (size
   is a multiple of 4 KB) and return the address or NULL if error. The
   allocated memory must be marked as executable by the host kernel and
   must be page aligned. On i386 with PAE (but not on x86_64), it must
   be allocated in the first 4 GB of physical memory. */
void * CDECL kqemu_vmalloc(unsigned int size)
{
    void *ptr;
    int r, s;

    /*
     * To avoid reuse of the allocated area, we waste some memory when
     * allocating a small amount.  Otherwise we would have to reset the
     * protection before the area is freed, but at that time we do not
     * know the size of the allocated area, which must be passed to
     * uvm_map_protect().
     */
    if (size <= MAXALLOCSAVE)
        size = MAXALLOCSAVE + 1;

    MALLOC(ptr, void *, size, M_KQEMU, M_WAITOK);
    DPRINTF1("kqemu_vmalloc(%u) -> %p\n", size, ptr);

    s = splvm(); /* XXX? */
    /* make the area executable */
    r = uvm_map_protect(kmem_map, (vaddr_t)ptr, (vaddr_t)ptr + size,
                        UVM_PROT_ALL, TRUE);
    splx(s);
    if (r != 0) {
        FREE(ptr, M_KQEMU);
        printf("kqemu_vmalloc: uvm_map_protect failed, errno %d\n", r);
        return NULL;
    }
    malloc_count++;
    return ptr;
}

void CDECL kqemu_vfree(void *ptr)
{
    DPRINTF1("kqemu_vfree(%p)\n", ptr);
    FREE(ptr, M_KQEMU);
    malloc_count--;
}

/* Convert a page aligned address inside a memory area allocated by
   kqemu_vmalloc() to a physical address (page index) */
unsigned long kqemu_vmalloc_to_phys(const void *vaddr)
{
    vaddr_t va = (vaddr_t)vaddr;
    paddr_t pa;

    if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) {
        printf("kqemu_vmalloc_to_phys: pmap_extract failed, "
               "va %08lx\n", va);
        return -1;
    }
    DPRINTF1("kqemu_vmalloc_to_phys(%p)->%08lx\n", vaddr, (unsigned long)pa);
    return pa >> PAGE_SHIFT;
}
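/*
 * Usage sketch (assumed caller behaviour, for illustration only): a buffer
 * obtained from kqemu_vmalloc() is translated page by page with
 * kqemu_vmalloc_to_phys() whenever the core needs physical page indexes:
 *
 *	uint8_t *buf = kqemu_vmalloc(2 * PAGE_SIZE);
 *	unsigned long pfn0 = kqemu_vmalloc_to_phys(buf);
 *	unsigned long pfn1 = kqemu_vmalloc_to_phys(buf + PAGE_SIZE);
 *	...
 *	kqemu_vfree(buf);
 */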
#if 1
/* Map an IO area in the kernel address space and return its
   address. Return NULL if error or not implemented. This function is
   only used if an APIC is detected on the host CPU. */
void * CDECL kqemu_io_map(unsigned long page_index, unsigned int size)
{
    vsize_t rsize = roundup(size, PAGE_SIZE);
    vaddr_t va;
    voff_t offset;

    va = uvm_km_alloc(kernel_map, rsize, 0,
                      UVM_KMF_VAONLY | UVM_KMF_CANFAIL | UVM_KMF_WAITVA);
    if (va == 0) {
        kqemu_log("uvm_km_alloc failed\n");
        return NULL;
    }
    /* enter unmanaged mappings for the requested physical pages */
    for (offset = 0; offset < rsize; offset += PAGE_SIZE)
        pmap_kenter_pa(va + offset, page_index++ << PAGE_SHIFT,
                       VM_PROT_READ | VM_PROT_WRITE);

    io_map_count += rsize >> PAGE_SHIFT;
    return (void *)va;
}

/* Unmap the IO area */
void CDECL kqemu_io_unmap(void *ptr, unsigned int size)
{
    vaddr_t va = (vaddr_t)ptr;
    vsize_t rsize = roundup(size, PAGE_SIZE);

    pmap_kremove(va, rsize);
    /* free the whole VA range allocated in kqemu_io_map() */
    uvm_km_free(kernel_map, va, rsize, UVM_KMF_VAONLY);
    io_map_count -= rsize >> PAGE_SHIFT;
}
#else
/* Map an IO area in the kernel address space and return its
   address. Return NULL if error or not implemented. This function is
   only used if an APIC is detected on the host CPU. */
void * CDECL kqemu_io_map(unsigned long page_index, unsigned int size)
{
    return NULL;
}

/* Unmap the IO area */
void CDECL kqemu_io_unmap(void *ptr, unsigned int size)
{
}
#endif

/* return TRUE if a signal is pending (i.e. the guest must stop
   execution) */
int CDECL kqemu_schedule(void)
{
    DPRINTF2("kqemu_schedule\n");
    yield();
#if 1
#if __NetBSD_Version__ < 499001000 /* pre-newlock2 */
    return CURSIG(curlwp) ? TRUE : FALSE;
#else
    return ((curlwp->l_flag & LW_PENDSIG) != 0);
#endif
#else
    return TRUE;
#endif
}

static char log_buf[4096];

void CDECL kqemu_log(const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(log_buf, sizeof(log_buf), fmt, ap);
    printf("kqemu: %s", log_buf);
    va_end(ap);
}

/*********************************************************/

static struct kqemu_global_state *kqemu_gs;

struct kqemu_instance {
#if __NetBSD_Version__ < 499001000 /* pre-newlock2 */
    /* NetBSD <= 4.x: giant lock */
#define mutex_init(m, t, i)	do {} while (/* CONSTCOND */0)
#define mutex_enter(a)		do {} while (/* CONSTCOND */0)
#define mutex_exit(a)		do {} while (/* CONSTCOND */0)
#else
    kmutex_t mutex;
#endif
    struct kqemu_state *state;
};

int kqemu_refcnt = 0;

static const struct fileops kqemu_fileops;

static int
kqemu_open(dev_t dev, int flags, int ifmt, struct lwp *l)
{
    int r, fd;
    struct file *filep;
    struct kqemu_instance *ks;

    DPRINTF("kqemu_open: entry\n");

#if __NetBSD_Version__ < 499005600
    r = falloc(l, &filep, &fd);
    if (r != 0) {
        printf("kqemu_open: falloc failed, errno %d\n", r);
        return r;
    }
#else
    r = fd_allocfile(&filep, &fd);
    if (r != 0) {
        printf("kqemu_open: fd_allocfile failed, errno %d\n", r);
        return r;
    }
#endif

    MALLOC(ks, struct kqemu_instance *, sizeof(*ks), M_KQEMU, M_WAITOK);
    memset(ks, 0, sizeof(*ks));
    mutex_init(&ks->mutex, MUTEX_DEFAULT, IPL_NONE);
    kqemu_refcnt++;

    /* fdclone never fails */
#if __NetBSD_Version__ < 499005600
    return fdclone(l, filep, fd, flags, &kqemu_fileops, ks);
#else
    return fd_clone(filep, fd, flags, &kqemu_fileops, ks);
#endif
}

static struct cdevsw kqemu_cdevsw = {
    kqemu_open, noclose, noread, nowrite, noioctl,
    nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};
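/*
 * Note on the open path above: kqemu_cdevsw only implements d_open.  Each
 * open() of the device allocates a private struct kqemu_instance and hands
 * it to fdclone()/fd_clone(), which attaches it as filep->f_data of a fresh
 * descriptor backed by kqemu_fileops below.  All further operations
 * (ioctl, close) therefore run on per-open state rather than on a single
 * device-global structure.
 */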
#if __NetBSD_Version__ < 499005600
static int
kqemu_fops_close(struct file *filep, struct lwp *l)
#else
static int
kqemu_fops_close(struct file *filep)
#endif
{
    struct kqemu_instance *ks = filep->f_data;

    DPRINTF("kqemu_fops_close: entry\n");

    if (ks == 0) {
        printf("kqemu_fops_close: NULL f_data\n");
        return ENODEV;
    }

    mutex_enter(&ks->mutex);
    if (ks->state) {
        kqemu_delete(ks->state);
        ks->state = NULL;
    }
    mutex_exit(&ks->mutex);

    FREE(ks, M_KQEMU);
    kqemu_refcnt--;

    DPRINTF("lock_count=%d, page_alloc_count=%d, malloc_count=%d, "
            "io_map_count=%d\n",
            lock_count, page_alloc_count, malloc_count, io_map_count);
    return 0;
}

#if __NetBSD_Version__ < 499003100
static int
fbadop_read(struct file *a, off_t *b, struct uio *c, kauth_cred_t d, int e)
{
    return ENODEV;
}

static int
fbadop_write(struct file *a, off_t *b, struct uio *c, kauth_cred_t d, int e)
{
    return ENODEV;
}
#endif

#if __NetBSD_Version__ < 499005600
static int
kqemu_fops_ioctl(struct file *filep, u_long cmd, void *data, struct lwp *l)
#else
static int
kqemu_fops_ioctl(struct file *filep, u_long cmd, void *data)
#endif
{
    struct kqemu_instance *ks = filep->f_data;
    struct kqemu_state *s;
    int ret;

    if (ks == 0) {
        printf("kqemu_fops_ioctl: NULL f_data\n");
        return EIO;
    }
    s = ks->state;

    mutex_enter(&ks->mutex);
    switch (cmd) {
    case KQEMU_INIT:
    {
        struct kqemu_init d1, *d = &d1, *arg = data;

        DPRINTF("kqemu_fops_ioctl: cmd KQEMU_INIT\n");
        if (s != NULL) {
            ret = EIO;
            break;
        }
        d1 = *arg;
        s = kqemu_init(d, kqemu_gs);
        if (!s) {
            ret = ENOMEM;
            break;
        }
        ks->state = s;
        ret = 0;
        break;
    }
    case KQEMU_EXEC:
    {
        struct kqemu_cpu_state *ctx, *arg = data;
        // int hold_count

        DPRINTF2("kqemu_fops_ioctl: cmd KQEMU_EXEC\n");
        if (!s) {
            ret = EIO;
            break;
        }
        ctx = kqemu_get_cpu_state(s);
        *ctx = *arg;
        /* we are leaving kernel code, thus we can release the lock, right? */
        /* no, kqemu_exec may call kqemu_alloc_zeroed_page() etc.
           since our vm code is not yet smp-safe, we need the lock */
        /* or we could acquire the lock in those callbacks. */
        // hold_count = KERNEL_LOCK_RELEASE_ALL();
        ret = kqemu_exec(s);
        // KERNEL_LOCK_ACQUIRE_COUNT(hold_count);
        if (ctx->retval == KQEMU_RET_SYSCALL) {
            DPRINTF2("kqemu_exec: syscall\n");
        } else if ((ctx->retval & 0xff00) == KQEMU_RET_INT) {
            DPRINTF2("kqemu_exec: interrupt 0x%02x\n", ctx->retval & 0xff);
        } else if ((ctx->retval & 0xff00) == KQEMU_RET_EXCEPTION) {
            DPRINTF2("kqemu_exec: exception 0x%02x\n", ctx->retval & 0xff);
        } else if (ctx->retval == KQEMU_RET_INTR) {
#if __NetBSD_Version__ < 499001000 /* pre-newlock2 */
            DPRINTF2("kqemu_exec: signal %d\n", CURSIG(curlwp));
#else
            DPRINTF2("kqemu_exec: signal\n");
#endif
        } else if (ctx->retval == KQEMU_RET_SOFTMMU) {
            DPRINTF2("kqemu_exec: soft MMU\n");
        } else if (ctx->retval == KQEMU_RET_ABORT) {
            DPRINTF2("kqemu_exec: abort\n");
        }
        *arg = *ctx;
        break;
    }
    case KQEMU_GET_VERSION:
        DPRINTF("kqemu_fops_ioctl: cmd KQEMU_GET_VERSION\n");
        *(int *)data = KQEMU_VERSION;
        ret = 0;
        break;
    default:
        DPRINTF("kqemu_fops_ioctl: unknown cmd\n");
        ret = EINVAL;
        break;
    }
    mutex_exit(&ks->mutex);
    return ret;
}

static const struct fileops kqemu_fileops = {
    fbadop_read,
    fbadop_write,
    kqemu_fops_ioctl,
    fnullop_fcntl,
    fnullop_poll,
    fbadop_stat,
    kqemu_fops_close,
    fnullop_kqfilter,
};

MOD_DEV("kqemu", "kqemu", NULL, -1, &kqemu_cdevsw, -1);

static int sysctl_num = -1;
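/*
 * Userland view of the device registered above (an illustrative sketch
 * only; the device node path and the exact call sequence used by QEMU are
 * assumptions, not taken from this file):
 *
 *	int fd = open("/dev/kqemu", O_RDWR);
 *	int version;
 *	ioctl(fd, KQEMU_GET_VERSION, &version);   (compare with KQEMU_VERSION)
 *
 *	struct kqemu_init init = { ... guest RAM layout ... };
 *	ioctl(fd, KQEMU_INIT, &init);             (creates the kqemu_state)
 *
 *	struct kqemu_cpu_state cpu = { ... };
 *	ioctl(fd, KQEMU_EXEC, &cpu);              (runs guest code; cpu.retval
 *	                                           reports why it returned)
 *	close(fd);                                (kqemu_fops_close() frees
 *	                                           the instance)
 */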
static int
kqemu_attach(struct lkm_table *lkmtp, int cmd)
{
    int max_locked_pages;
    const struct sysctlnode *node;
    int ret = 0;
    char buf[9];

    printf("QEMU Accelerator Module version %d.%d.%d, "
           "Copyright (c) 2005-2007 Fabrice Bellard\n",
           (KQEMU_VERSION >> 16),
           (KQEMU_VERSION >> 8) & 0xff,
           (KQEMU_VERSION) & 0xff);

    max_locked_pages = physmem / 2;

    kqemu_gs = kqemu_global_init(max_locked_pages);
    if (!kqemu_gs)
        return ENOMEM;

    ret = sysctl_createv(NULL, 0, NULL, &node,
                         CTLFLAG_READWRITE, CTLTYPE_INT, "kqemu",
                         SYSCTL_DESCR("kqemu debug level"),
                         NULL, 0, &kqemu_debug, 0,
                         CTL_DEBUG, CTL_CREATE, CTL_EOL);
    if (ret != 0)
        printf("sysctl_kqemu_setup: sysctl_createv failed, errno %d\n", ret);
    else
        sysctl_num = node->sysctl_num;

    format_bytes(buf, sizeof(buf), max_locked_pages * PAGE_SIZE);
    printf("KQEMU installed, max_locked_mem=%s, major=%d.\n",
           buf, cdevsw_lookup_major(&kqemu_cdevsw));
    return 0;
}

static int
kqemu_detach(struct lkm_table *lkmtp, int cmd)
{
    if (kqemu_refcnt != 0) {
        DPRINTF("kqemu_detach: refcnt %d\n", kqemu_refcnt);
        return EBUSY;
    }

    if (kqemu_gs) {
        kqemu_global_delete(kqemu_gs);
        kqemu_gs = NULL;
    }

    if (lock_count != 0 || page_alloc_count != 0 ||
        io_map_count != 0 || malloc_count != 0) {
        printf("kqemu_detach: lock_count=%d, page_alloc_count=%d, "
               "io_map_count=%d, malloc_count=%d\n",
               lock_count, page_alloc_count, io_map_count, malloc_count);
        /* return EBUSY; */
    }

    if (sysctl_num >= 0)
        sysctl_destroyv(NULL, CTL_DEBUG, sysctl_num, CTL_EOL);
    return 0;
}

static int
kqemu_stat(struct lkm_table *lkmtp, int cmd) /* XXX */
{
    DPRINTF("lock_count=%d, page_alloc_count=%d, "
            "io_map_count=%d, malloc_count=%d\n",
            lock_count, page_alloc_count, io_map_count, malloc_count);
    return 0;
}

int
kqemu_lkmentry(struct lkm_table *lkmtp, int cmd, int ver)
{
    LKM_DISPATCH(lkmtp, cmd, NULL, kqemu_attach, kqemu_detach, kqemu_stat);
}
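/*
 * Loading sketch (the object file name is an assumption): as a classic LKM
 * this module is expected to be handled with the LKM tools, e.g.
 *
 *	modload ./kqemu.o
 *	modstat -n kqemu
 *	modunload -n kqemu
 *
 * kqemu_lkmentry() is the entry point resolved at load time;
 * LKM_DISPATCH() routes the load, unload and stat requests to
 * kqemu_attach(), kqemu_detach() and kqemu_stat() respectively.
 */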