Subversion Repositories HelenOS

Compare Revisions

Rev 2786 → Rev 2787

/branches/tracing/kernel/generic/src/mm/slab.c
0,0 → 1,940
/*
* Copyright (c) 2006 Ondrej Palkovsky
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Slab allocator.
*
* The slab allocator is closely modelled after the OpenSolaris slab allocator.
* @see http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick_html/
*
* with the following exceptions:
* @li empty slabs are deallocated immediately
* (in Linux they are kept in a linked list, in Solaris ???)
* @li empty magazines are deallocated when not needed
* (in Solaris they are held in a linked list in the slab cache)
*
* The following features are not currently supported but would be easy to add:
* @li cache coloring
* @li dynamic magazine growing (different magazine sizes are already
* supported, but we would need to adjust the allocation strategy)
*
* The slab allocator supports per-CPU caches ('magazines') to facilitate
* good SMP scaling.
*
* When a new object is being allocated, it is first checked whether it is
* available in a CPU-bound magazine. If it is not found there, it is
* allocated from a CPU-shared slab - if a partially full one is found,
* it is used, otherwise a new one is allocated.
*
* When an object is being deallocated, it is put into a CPU-bound magazine.
* If there is no such magazine, a new one is allocated (if this fails,
* the object is deallocated into its slab). If the magazine is full, it is
* put onto the CPU-shared list of magazines and a new one is allocated.
*
* The CPU-bound magazine is actually a pair of magazines in order to avoid
* thrashing when somebody is allocating/deallocating 1 item at the magazine
* size boundary. LIFO order is enforced, which should avoid fragmentation
* as much as possible.
*
* Every cache contains a list of full slabs and a list of partially full slabs.
* Empty slabs are immediately freed (thrashing will be avoided because
* of magazines).
*
* The slab information structure is kept inside the data area, if possible.
* A cache can be marked so that it does not use magazines. This is used
* only for slab-related caches to avoid deadlocks and infinite recursion
* (the slab allocator uses itself for allocating all of its control structures).
*
* The slab allocator allocates a lot of space and does not free it. When
* the frame allocator fails to allocate a frame, it calls slab_reclaim().
* It tries 'light reclaim' first, then brutal reclaim. The light reclaim
* releases slabs from the CPU-shared magazine list, until at least 1 slab
* is deallocated in each cache (this algorithm should probably change).
* The brutal reclaim removes all cached objects, even from CPU-bound
* magazines.
*
* @todo
* For better CPU scaling the magazine allocation strategy should
* be extended. Currently, if a cache does not have a magazine, it asks
* the non-CPU-cached magazine cache to provide one. It might be feasible
* to add a CPU-cached magazine cache (which would allocate its magazines
* from the non-CPU-cached magazine cache). This would provide a nice per-CPU
* buffer. The other possibility is to use the per-cache
* 'empty-magazine-list', which decreases contention on the single per-system
* magazine cache.
*
* @todo
* It might be good to add lock granularity at the slab level as well;
* we could then try_spinlock over all partial slabs and thus improve
* scalability even at the slab level.
*/
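/*
 * Illustrative usage sketch (not part of the original file): a typical
 * client of this allocator creates a cache for a fixed-size object type
 * and then allocates and frees objects from it. The type foo_t and the
 * cache pointer below are hypothetical placeholders.
 *
 *	static slab_cache_t *foo_cache;
 *
 *	foo_cache = slab_cache_create("foo_t", sizeof(foo_t), 0,
 *	    NULL, NULL, 0);
 *	foo_t *foo = slab_alloc(foo_cache, 0);
 *	...use foo...
 *	slab_free(foo_cache, foo);
 *	slab_cache_destroy(foo_cache);
 */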
 
#include <synch/spinlock.h>
#include <mm/slab.h>
#include <adt/list.h>
#include <memstr.h>
#include <align.h>
#include <mm/frame.h>
#include <config.h>
#include <print.h>
#include <arch.h>
#include <panic.h>
#include <debug.h>
#include <bitops.h>
#include <macros.h>
 
SPINLOCK_INITIALIZE(slab_cache_lock);
static LIST_INITIALIZE(slab_cache_list);
 
/** Magazine cache */
static slab_cache_t mag_cache;
/** Cache for cache descriptors */
static slab_cache_t slab_cache_cache;
/** Cache for external slab descriptors
* This time we want a per-CPU cache, so do not make it static
* - using the slab allocator for internal slab structures will not deadlock,
* as all slab structures are 'small' - the control structures of
* their caches do not require further allocation
*/
static slab_cache_t *slab_extern_cache;
/** Caches for malloc */
static slab_cache_t *malloc_caches[SLAB_MAX_MALLOC_W - SLAB_MIN_MALLOC_W + 1];
char *malloc_names[] = {
"malloc-16",
"malloc-32",
"malloc-64",
"malloc-128",
"malloc-256",
"malloc-512",
"malloc-1K",
"malloc-2K",
"malloc-4K",
"malloc-8K",
"malloc-16K",
"malloc-32K",
"malloc-64K",
"malloc-128K",
"malloc-256K"
};
 
/** Slab descriptor */
typedef struct {
slab_cache_t *cache; /**< Pointer to parent cache. */
link_t link; /**< List of full/partial slabs. */
void *start; /**< Start address of first available item. */
count_t available; /**< Count of available items in this slab. */
index_t nextavail; /**< The index of next available item. */
} slab_t;
 
#ifdef CONFIG_DEBUG
static int _slab_initialized = 0;
#endif
 
/**************************************/
/* Slab allocation functions */
 
/**
* Allocate frames for slab space and initialize the slab descriptor
*
*/
static slab_t * slab_space_alloc(slab_cache_t *cache, int flags)
{
void *data;
slab_t *slab;
size_t fsize;
unsigned int i;
unsigned int zone = 0;
data = frame_alloc_generic(cache->order, FRAME_KA | flags, &zone);
if (!data) {
return NULL;
}
if (! (cache->flags & SLAB_CACHE_SLINSIDE)) {
slab = slab_alloc(slab_extern_cache, flags);
if (!slab) {
frame_free(KA2PA(data));
return NULL;
}
} else {
fsize = (PAGE_SIZE << cache->order);
slab = data + fsize - sizeof(*slab);
}
/* Fill in slab structures */
for (i = 0; i < ((unsigned int) 1 << cache->order); i++)
frame_set_parent(ADDR2PFN(KA2PA(data)) + i, slab, zone);
 
slab->start = data;
slab->available = cache->objects;
slab->nextavail = 0;
slab->cache = cache;
 
for (i = 0; i < cache->objects; i++)
*((int *) (slab->start + i*cache->size)) = i+1;
 
atomic_inc(&cache->allocated_slabs);
return slab;
}
 
/**
* Deallocate space associated with slab
*
* @return number of freed frames
*/
static count_t slab_space_free(slab_cache_t *cache, slab_t *slab)
{
frame_free(KA2PA(slab->start));
if (! (cache->flags & SLAB_CACHE_SLINSIDE))
slab_free(slab_extern_cache, slab);
 
atomic_dec(&cache->allocated_slabs);
return 1 << cache->order;
}
 
/** Map object to slab structure */
static slab_t * obj2slab(void *obj)
{
return (slab_t *) frame_get_parent(ADDR2PFN(KA2PA(obj)), 0);
}
 
/**************************************/
/* Slab functions */
 
 
/**
* Return object to slab and call a destructor
*
* @param slab Slab of the object if known directly by the caller, otherwise NULL
*
* @return Number of freed pages
*/
static count_t slab_obj_destroy(slab_cache_t *cache, void *obj,
slab_t *slab)
{
int freed = 0;
 
if (!slab)
slab = obj2slab(obj);
 
ASSERT(slab->cache == cache);
 
if (cache->destructor)
freed = cache->destructor(obj);
spinlock_lock(&cache->slablock);
ASSERT(slab->available < cache->objects);
 
*((int *)obj) = slab->nextavail;
slab->nextavail = (obj - slab->start)/cache->size;
slab->available++;
 
/* Move it to correct list */
if (slab->available == cache->objects) {
/* Free associated memory */
list_remove(&slab->link);
spinlock_unlock(&cache->slablock);
 
return freed + slab_space_free(cache, slab);
 
} else if (slab->available == 1) {
/* It was in full, move to partial */
list_remove(&slab->link);
list_prepend(&slab->link, &cache->partial_slabs);
}
spinlock_unlock(&cache->slablock);
return freed;
}
 
/**
* Take a new object from a slab or create a new slab if needed
*
* @return Object address or NULL
*/
static void * slab_obj_create(slab_cache_t *cache, int flags)
{
slab_t *slab;
void *obj;
 
spinlock_lock(&cache->slablock);
 
if (list_empty(&cache->partial_slabs)) {
/* Allow recursion and reclaiming
* - this should work, as the slab control structures
* are small and do not need to allocate with anything
* other than frame_alloc when they are allocating;
* that is why recursion should be at most one level deep
*/
spinlock_unlock(&cache->slablock);
slab = slab_space_alloc(cache, flags);
if (!slab)
return NULL;
spinlock_lock(&cache->slablock);
} else {
slab = list_get_instance(cache->partial_slabs.next, slab_t, link);
list_remove(&slab->link);
}
obj = slab->start + slab->nextavail * cache->size;
slab->nextavail = *((int *)obj);
slab->available--;
 
if (!slab->available)
list_prepend(&slab->link, &cache->full_slabs);
else
list_prepend(&slab->link, &cache->partial_slabs);
 
spinlock_unlock(&cache->slablock);
 
if (cache->constructor && cache->constructor(obj, flags)) {
/* Bad, bad, construction failed */
slab_obj_destroy(cache, obj, slab);
return NULL;
}
return obj;
}
 
/**************************************/
/* CPU-Cache slab functions */
 
/**
* Find a full magazine in the cache, take it off the list
* and return it
*
* @param first If true, return the first magazine, otherwise the last
*/
static slab_magazine_t * get_mag_from_cache(slab_cache_t *cache,
int first)
{
slab_magazine_t *mag = NULL;
link_t *cur;
 
spinlock_lock(&cache->maglock);
if (!list_empty(&cache->magazines)) {
if (first)
cur = cache->magazines.next;
else
cur = cache->magazines.prev;
mag = list_get_instance(cur, slab_magazine_t, link);
list_remove(&mag->link);
atomic_dec(&cache->magazine_counter);
}
spinlock_unlock(&cache->maglock);
return mag;
}
 
/** Prepend magazine to magazine list in cache */
static void put_mag_to_cache(slab_cache_t *cache, slab_magazine_t *mag)
{
spinlock_lock(&cache->maglock);
 
list_prepend(&mag->link, &cache->magazines);
atomic_inc(&cache->magazine_counter);
spinlock_unlock(&cache->maglock);
}
 
/**
* Free all objects in magazine and free memory associated with magazine
*
* @return Number of freed pages
*/
static count_t magazine_destroy(slab_cache_t *cache,
slab_magazine_t *mag)
{
unsigned int i;
count_t frames = 0;
 
for (i = 0; i < mag->busy; i++) {
frames += slab_obj_destroy(cache, mag->objs[i], NULL);
atomic_dec(&cache->cached_objs);
}
slab_free(&mag_cache, mag);
 
return frames;
}
 
/**
* Find a full magazine, set it as the current one and return it
*
* Assume the cpu_magazine lock is held
*/
static slab_magazine_t * get_full_current_mag(slab_cache_t *cache)
{
slab_magazine_t *cmag, *lastmag, *newmag;
 
cmag = cache->mag_cache[CPU->id].current;
lastmag = cache->mag_cache[CPU->id].last;
if (cmag) { /* First try local CPU magazines */
if (cmag->busy)
return cmag;
 
if (lastmag && lastmag->busy) {
cache->mag_cache[CPU->id].current = lastmag;
cache->mag_cache[CPU->id].last = cmag;
return lastmag;
}
}
/* Local magazines are empty, import one from magazine list */
newmag = get_mag_from_cache(cache, 1);
if (!newmag)
return NULL;
 
if (lastmag)
magazine_destroy(cache, lastmag);
 
cache->mag_cache[CPU->id].last = cmag;
cache->mag_cache[CPU->id].current = newmag;
return newmag;
}
 
/**
* Try to find object in CPU-cache magazines
*
* @return Pointer to object or NULL if not available
*/
static void * magazine_obj_get(slab_cache_t *cache)
{
slab_magazine_t *mag;
void *obj;
 
if (!CPU)
return NULL;
 
spinlock_lock(&cache->mag_cache[CPU->id].lock);
 
mag = get_full_current_mag(cache);
if (!mag) {
spinlock_unlock(&cache->mag_cache[CPU->id].lock);
return NULL;
}
obj = mag->objs[--mag->busy];
spinlock_unlock(&cache->mag_cache[CPU->id].lock);
atomic_dec(&cache->cached_objs);
return obj;
}
 
/**
* Make sure the current magazine has room for another object; return a
* pointer to it, or NULL if no such magazine is available and a new one
* cannot be allocated
*
* Assume mag_cache[CPU->id].lock is held
*
* We have 2 magazines bound to the processor.
* First try the current one.
* If it is full, try the last one.
* If that is full too, put it on the magazines list,
* allocate a new one and exchange last & current
*
*/
static slab_magazine_t * make_empty_current_mag(slab_cache_t *cache)
{
slab_magazine_t *cmag, *lastmag, *newmag;
 
cmag = cache->mag_cache[CPU->id].current;
lastmag = cache->mag_cache[CPU->id].last;
 
if (cmag) {
if (cmag->busy < cmag->size)
return cmag;
if (lastmag && lastmag->busy < lastmag->size) {
cache->mag_cache[CPU->id].last = cmag;
cache->mag_cache[CPU->id].current = lastmag;
return lastmag;
}
}
/* current | last are full | nonexistent, allocate new */
/* We do not want to sleep just because of caching */
/* Especially we do not want reclaiming to start, as
* this would deadlock */
newmag = slab_alloc(&mag_cache, FRAME_ATOMIC | FRAME_NO_RECLAIM);
if (!newmag)
return NULL;
newmag->size = SLAB_MAG_SIZE;
newmag->busy = 0;
 
/* Flush last to magazine list */
if (lastmag)
put_mag_to_cache(cache, lastmag);
 
/* Move current as last, save new as current */
cache->mag_cache[CPU->id].last = cmag;
cache->mag_cache[CPU->id].current = newmag;
 
return newmag;
}
 
/**
* Put an object into the CPU-cache magazine
*
* @return 0 on success, -1 if memory could not be obtained
*/
static int magazine_obj_put(slab_cache_t *cache, void *obj)
{
slab_magazine_t *mag;
 
if (!CPU)
return -1;
 
spinlock_lock(&cache->mag_cache[CPU->id].lock);
 
mag = make_empty_current_mag(cache);
if (!mag) {
spinlock_unlock(&cache->mag_cache[CPU->id].lock);
return -1;
}
mag->objs[mag->busy++] = obj;
 
spinlock_unlock(&cache->mag_cache[CPU->id].lock);
atomic_inc(&cache->cached_objs);
return 0;
}
 
 
/**************************************/
/* Slab cache functions */
 
/** Return the number of objects that fit into a slab of the given cache */
static unsigned int comp_objects(slab_cache_t *cache)
{
if (cache->flags & SLAB_CACHE_SLINSIDE)
return ((PAGE_SIZE << cache->order) - sizeof(slab_t)) / cache->size;
else
return (PAGE_SIZE << cache->order) / cache->size;
}
 
/** Return wasted space in slab */
static unsigned int badness(slab_cache_t *cache)
{
unsigned int objects;
unsigned int ssize;
 
objects = comp_objects(cache);
ssize = PAGE_SIZE << cache->order;
if (cache->flags & SLAB_CACHE_SLINSIDE)
ssize -= sizeof(slab_t);
return ssize - objects * cache->size;
}
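/*
 * Worked example (illustrative, assuming 4 KiB pages): a cache of 512-byte
 * objects with an external slab descriptor and order 0 gives
 * comp_objects() = 4096 / 512 = 8 objects per slab and
 * badness() = 4096 - 8 * 512 = 0 wasted bytes. With SLAB_CACHE_SLINSIDE,
 * sizeof(slab_t) is subtracted from the usable space first, so fewer
 * objects may fit and the waste is computed accordingly.
 */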
 
/**
* Initialize mag_cache structure in slab cache
*/
static void make_magcache(slab_cache_t *cache)
{
unsigned int i;
ASSERT(_slab_initialized >= 2);
 
cache->mag_cache = malloc(sizeof(slab_mag_cache_t) * config.cpu_count, 0);
for (i = 0; i < config.cpu_count; i++) {
memsetb((uintptr_t)&cache->mag_cache[i],
sizeof(cache->mag_cache[i]), 0);
spinlock_initialize(&cache->mag_cache[i].lock, "slab_maglock_cpu");
}
}
 
/** Initialize allocated memory as a slab cache */
static void
_slab_cache_create(slab_cache_t *cache,
char *name,
size_t size,
size_t align,
int (*constructor)(void *obj, int kmflag),
int (*destructor)(void *obj),
int flags)
{
int pages;
ipl_t ipl;
 
memsetb((uintptr_t)cache, sizeof(*cache), 0);
cache->name = name;
 
if (align < sizeof(unative_t))
align = sizeof(unative_t);
size = ALIGN_UP(size, align);
cache->size = size;
 
cache->constructor = constructor;
cache->destructor = destructor;
cache->flags = flags;
 
list_initialize(&cache->full_slabs);
list_initialize(&cache->partial_slabs);
list_initialize(&cache->magazines);
spinlock_initialize(&cache->slablock, "slab_lock");
spinlock_initialize(&cache->maglock, "slab_maglock");
if (! (cache->flags & SLAB_CACHE_NOMAGAZINE))
make_magcache(cache);
 
/* Compute slab sizes, object counts in slabs etc. */
if (cache->size < SLAB_INSIDE_SIZE)
cache->flags |= SLAB_CACHE_SLINSIDE;
 
/* Minimum slab order */
pages = SIZE2FRAMES(cache->size);
/* We need 2^order >= pages */
if (pages == 1)
cache->order = 0;
else
cache->order = fnzb(pages-1)+1;
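/*
 * Example (illustrative, assuming 4 KiB frames): for a 10 KiB object,
 * pages = 3 and fnzb(2) = 1, so the initial order is 2, i.e. the slab
 * spans 2^2 = 4 frames, the smallest power of two not below 3. The loop
 * below may still increase the order if the waste (badness) is too high.
 */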
 
while (badness(cache) > SLAB_MAX_BADNESS(cache)) {
cache->order += 1;
}
cache->objects = comp_objects(cache);
/* If the slab descriptor fits into the wasted space, keep it inside the slab */
if (badness(cache) > sizeof(slab_t))
cache->flags |= SLAB_CACHE_SLINSIDE;
 
/* Add cache to cache list */
ipl = interrupts_disable();
spinlock_lock(&slab_cache_lock);
 
list_append(&cache->link, &slab_cache_list);
 
spinlock_unlock(&slab_cache_lock);
interrupts_restore(ipl);
}
 
/** Create slab cache */
slab_cache_t * slab_cache_create(char *name,
size_t size,
size_t align,
int (*constructor)(void *obj, int kmflag),
int (*destructor)(void *obj),
int flags)
{
slab_cache_t *cache;
 
cache = slab_alloc(&slab_cache_cache, 0);
_slab_cache_create(cache, name, size, align, constructor, destructor,
flags);
return cache;
}
 
/**
* Reclaim space occupied by objects that are already free
*
* @param flags If contains SLAB_RECLAIM_ALL, do aggressive freeing
* @return Number of freed pages
*/
static count_t _slab_reclaim(slab_cache_t *cache, int flags)
{
unsigned int i;
slab_magazine_t *mag;
count_t frames = 0;
int magcount;
if (cache->flags & SLAB_CACHE_NOMAGAZINE)
return 0; /* Nothing to do */
 
/* We count up to the original magazine count to avoid
* an endless loop
*/
magcount = atomic_get(&cache->magazine_counter);
while (magcount-- && (mag = get_mag_from_cache(cache, 0))) {
frames += magazine_destroy(cache, mag);
if (!(flags & SLAB_RECLAIM_ALL) && frames)
break;
}
if (flags & SLAB_RECLAIM_ALL) {
/* Free cpu-bound magazines */
/* Destroy CPU magazines */
for (i = 0; i < config.cpu_count; i++) {
spinlock_lock(&cache->mag_cache[i].lock);
 
mag = cache->mag_cache[i].current;
if (mag)
frames += magazine_destroy(cache, mag);
cache->mag_cache[i].current = NULL;
mag = cache->mag_cache[i].last;
if (mag)
frames += magazine_destroy(cache, mag);
cache->mag_cache[i].last = NULL;
 
spinlock_unlock(&cache->mag_cache[i].lock);
}
}
 
return frames;
}
 
/** Check that there are no slabs and remove cache from system */
void slab_cache_destroy(slab_cache_t *cache)
{
ipl_t ipl;
 
/* First remove the cache from the list of all caches, so that we don't need
* to disable interrupts later
*/
 
ipl = interrupts_disable();
spinlock_lock(&slab_cache_lock);
 
list_remove(&cache->link);
 
spinlock_unlock(&slab_cache_lock);
interrupts_restore(ipl);
 
/* Do not lock anything, we assume the software is correct and
* does not touch the cache when it decides to destroy it */
/* Destroy all magazines */
_slab_reclaim(cache, SLAB_RECLAIM_ALL);
 
/* All slabs must be empty */
if (!list_empty(&cache->full_slabs) \
|| !list_empty(&cache->partial_slabs))
panic("Destroying cache that is not empty.");
 
if (!(cache->flags & SLAB_CACHE_NOMAGAZINE))
free(cache->mag_cache);
slab_free(&slab_cache_cache, cache);
}
 
/** Allocate a new object from the cache - if no flags are given, this
always returns memory */
void * slab_alloc(slab_cache_t *cache, int flags)
{
ipl_t ipl;
void *result = NULL;
/* Disable interrupts to avoid deadlocks with interrupt handlers */
ipl = interrupts_disable();
 
if (!(cache->flags & SLAB_CACHE_NOMAGAZINE)) {
result = magazine_obj_get(cache);
}
if (!result)
result = slab_obj_create(cache, flags);
 
interrupts_restore(ipl);
 
if (result)
atomic_inc(&cache->allocated_objs);
 
return result;
}
 
/** Return object to cache, use slab if known */
static void _slab_free(slab_cache_t *cache, void *obj, slab_t *slab)
{
ipl_t ipl;
 
ipl = interrupts_disable();
 
if ((cache->flags & SLAB_CACHE_NOMAGAZINE) \
|| magazine_obj_put(cache, obj)) {
 
slab_obj_destroy(cache, obj, slab);
 
}
interrupts_restore(ipl);
atomic_dec(&cache->allocated_objs);
}
 
/** Return slab object to cache */
void slab_free(slab_cache_t *cache, void *obj)
{
_slab_free(cache, obj, NULL);
}
 
/* Go through all caches and reclaim what is possible */
count_t slab_reclaim(int flags)
{
slab_cache_t *cache;
link_t *cur;
count_t frames = 0;
 
spinlock_lock(&slab_cache_lock);
 
/* TODO: Add assert, that interrupts are disabled, otherwise
* memory allocation from interrupts can deadlock.
*/
 
for (cur = slab_cache_list.next; cur != &slab_cache_list; cur = cur->next) {
cache = list_get_instance(cur, slab_cache_t, link);
frames += _slab_reclaim(cache, flags);
}
 
spinlock_unlock(&slab_cache_lock);
 
return frames;
}
 
 
/* Print list of slabs */
void slab_print_list(void)
{
slab_cache_t *cache;
link_t *cur;
ipl_t ipl;
ipl = interrupts_disable();
spinlock_lock(&slab_cache_lock);
printf("slab name size pages obj/pg slabs cached allocated ctl\n");
printf("---------------- -------- ------ ------ ------ ------ --------- ---\n");
for (cur = slab_cache_list.next; cur != &slab_cache_list; cur = cur->next) {
cache = list_get_instance(cur, slab_cache_t, link);
printf("%-16s %8zd %6zd %6zd %6zd %6zd %9zd %-3s\n", cache->name, cache->size, (1 << cache->order), cache->objects, atomic_get(&cache->allocated_slabs), atomic_get(&cache->cached_objs), atomic_get(&cache->allocated_objs), cache->flags & SLAB_CACHE_SLINSIDE ? "in" : "out");
}
spinlock_unlock(&slab_cache_lock);
interrupts_restore(ipl);
}
 
void slab_cache_init(void)
{
int i, size;
 
/* Initialize magazine cache */
_slab_cache_create(&mag_cache,
"slab_magazine",
sizeof(slab_magazine_t)+SLAB_MAG_SIZE*sizeof(void*),
sizeof(uintptr_t),
NULL, NULL,
SLAB_CACHE_NOMAGAZINE | SLAB_CACHE_SLINSIDE);
/* Initialize slab_cache cache */
_slab_cache_create(&slab_cache_cache,
"slab_cache",
sizeof(slab_cache_cache),
sizeof(uintptr_t),
NULL, NULL,
SLAB_CACHE_NOMAGAZINE | SLAB_CACHE_SLINSIDE);
/* Initialize external slab cache */
slab_extern_cache = slab_cache_create("slab_extern",
sizeof(slab_t),
0, NULL, NULL,
SLAB_CACHE_SLINSIDE | SLAB_CACHE_MAGDEFERRED);
 
/* Initialize structures for malloc */
for (i = 0, size = (1 << SLAB_MIN_MALLOC_W);
i < (SLAB_MAX_MALLOC_W - SLAB_MIN_MALLOC_W + 1);
i++, size <<= 1) {
malloc_caches[i] = slab_cache_create(malloc_names[i],
size, 0,
NULL, NULL, SLAB_CACHE_MAGDEFERRED);
}
#ifdef CONFIG_DEBUG
_slab_initialized = 1;
#endif
}
 
/** Enable cpu_cache
*
* The kernel calls this function when it knows the real number of
* processors.
* Allocate the per-CPU magazine caches and enable them on all existing
* caches that are marked SLAB_CACHE_MAGDEFERRED
*/
void slab_enable_cpucache(void)
{
link_t *cur;
slab_cache_t *s;
 
#ifdef CONFIG_DEBUG
_slab_initialized = 2;
#endif
 
spinlock_lock(&slab_cache_lock);
for (cur = slab_cache_list.next; cur != &slab_cache_list; cur = cur->next) {
s = list_get_instance(cur, slab_cache_t, link);
if ((s->flags & SLAB_CACHE_MAGDEFERRED) != SLAB_CACHE_MAGDEFERRED)
continue;
make_magcache(s);
s->flags &= ~SLAB_CACHE_MAGDEFERRED;
}
 
spinlock_unlock(&slab_cache_lock);
}
 
/**************************************/
/* malloc/free functions */
void * malloc(unsigned int size, int flags)
{
ASSERT(_slab_initialized);
ASSERT(size && size <= (1 << SLAB_MAX_MALLOC_W));
if (size < (1 << SLAB_MIN_MALLOC_W))
size = (1 << SLAB_MIN_MALLOC_W);
 
int idx = fnzb(size - 1) - SLAB_MIN_MALLOC_W + 1;
 
return slab_alloc(malloc_caches[idx], flags);
}
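/*
 * Worked example (illustrative): assuming SLAB_MIN_MALLOC_W is 4 (the
 * smallest cache being "malloc-16"), a request for 100 bytes gives
 * fnzb(99) = 6 and idx = 6 - 4 + 1 = 3, so it is served from the
 * "malloc-128" cache; an exact power of two such as 64 gives fnzb(63) = 5
 * and idx = 2, i.e. the "malloc-64" cache.
 */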
 
void * realloc(void *ptr, unsigned int size, int flags)
{
ASSERT(_slab_initialized);
ASSERT(size <= (1 << SLAB_MAX_MALLOC_W));
void *new_ptr;
if (size > 0) {
if (size < (1 << SLAB_MIN_MALLOC_W))
size = (1 << SLAB_MIN_MALLOC_W);
int idx = fnzb(size - 1) - SLAB_MIN_MALLOC_W + 1;
new_ptr = slab_alloc(malloc_caches[idx], flags);
} else
new_ptr = NULL;
if ((new_ptr != NULL) && (ptr != NULL)) {
slab_t *slab = obj2slab(ptr);
memcpy(new_ptr, ptr, min(size, slab->cache->size));
}
if (ptr != NULL)
free(ptr);
return new_ptr;
}
 
void free(void *ptr)
{
if (!ptr)
return;
 
slab_t *slab = obj2slab(ptr);
_slab_free(slab->cache, ptr, slab);
}
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/tlb.c
0,0 → 1,190
/*
* Copyright (c) 2001-2004 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Generic TLB shootdown algorithm.
*
* The algorithm implemented here is based on the CMU TLB shootdown
* algorithm and is further simplified (e.g. all CPUs receive all TLB
* shootdown messages).
*/
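/*
 * Illustrative sketch (not part of the original file): a sender typically
 * brackets its own TLB invalidation with the start and finalize calls,
 * with interrupts disabled, e.g. when unmapping a range of pages:
 *
 *	ipl_t ipl = interrupts_disable();
 *	tlb_shootdown_start(TLB_INVL_PAGES, as->asid, base, cnt);
 *	tlb_invalidate_pages(as->asid, base, cnt);
 *	tlb_shootdown_finalize();
 *	interrupts_restore(ipl);
 *
 * Here as, base and cnt are placeholders for the affected address space,
 * the first virtual page address and the number of pages.
 */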
 
#include <mm/tlb.h>
#include <mm/asid.h>
#include <arch/mm/tlb.h>
#include <smp/ipi.h>
#include <synch/spinlock.h>
#include <atomic.h>
#include <arch/interrupt.h>
#include <config.h>
#include <arch.h>
#include <panic.h>
#include <debug.h>
#include <cpu.h>
 
/**
* This lock is used for synchronisation between the sender and
* the recipients of TLB shootdown messages. It must be acquired
* before the CPU structure lock.
*/
SPINLOCK_INITIALIZE(tlblock);
 
void tlb_init(void)
{
tlb_arch_init();
}
 
#ifdef CONFIG_SMP
 
/** Send TLB shootdown message.
*
* This function attempts to deliver TLB shootdown message
* to all other processors.
*
* This function must be called with interrupts disabled.
*
* @param type Type describing scope of shootdown.
* @param asid Address space, if required by type.
* @param page Virtual page address, if required by type.
* @param count Number of pages, if required by type.
*/
void tlb_shootdown_start(tlb_invalidate_type_t type, asid_t asid,
uintptr_t page, count_t count)
{
unsigned int i;
 
CPU->tlb_active = 0;
spinlock_lock(&tlblock);
for (i = 0; i < config.cpu_count; i++) {
cpu_t *cpu;
if (i == CPU->id)
continue;
 
cpu = &cpus[i];
spinlock_lock(&cpu->lock);
if (cpu->tlb_messages_count == TLB_MESSAGE_QUEUE_LEN) {
/*
* The message queue is full.
* Erase the queue and store one TLB_INVL_ALL message.
*/
cpu->tlb_messages_count = 1;
cpu->tlb_messages[0].type = TLB_INVL_ALL;
cpu->tlb_messages[0].asid = ASID_INVALID;
cpu->tlb_messages[0].page = 0;
cpu->tlb_messages[0].count = 0;
} else {
/*
* Enqueue the message.
*/
index_t idx = cpu->tlb_messages_count++;
cpu->tlb_messages[idx].type = type;
cpu->tlb_messages[idx].asid = asid;
cpu->tlb_messages[idx].page = page;
cpu->tlb_messages[idx].count = count;
}
spinlock_unlock(&cpu->lock);
}
tlb_shootdown_ipi_send();
 
busy_wait:
for (i = 0; i < config.cpu_count; i++)
if (cpus[i].tlb_active)
goto busy_wait;
}
 
/** Finish TLB shootdown sequence. */
void tlb_shootdown_finalize(void)
{
spinlock_unlock(&tlblock);
CPU->tlb_active = 1;
}
 
void tlb_shootdown_ipi_send(void)
{
ipi_broadcast(VECTOR_TLB_SHOOTDOWN_IPI);
}
 
/** Receive TLB shootdown message. */
void tlb_shootdown_ipi_recv(void)
{
tlb_invalidate_type_t type;
asid_t asid;
uintptr_t page;
count_t count;
unsigned int i;
ASSERT(CPU);
CPU->tlb_active = 0;
spinlock_lock(&tlblock);
spinlock_unlock(&tlblock);
spinlock_lock(&CPU->lock);
ASSERT(CPU->tlb_messages_count <= TLB_MESSAGE_QUEUE_LEN);
 
for (i = 0; i < CPU->tlb_messages_count; CPU->tlb_messages_count--) {
type = CPU->tlb_messages[i].type;
asid = CPU->tlb_messages[i].asid;
page = CPU->tlb_messages[i].page;
count = CPU->tlb_messages[i].count;
 
switch (type) {
case TLB_INVL_ALL:
tlb_invalidate_all();
break;
case TLB_INVL_ASID:
tlb_invalidate_asid(asid);
break;
case TLB_INVL_PAGES:
ASSERT(count);
tlb_invalidate_pages(asid, page, count);
break;
default:
panic("unknown type (%d)\n", type);
break;
}
if (type == TLB_INVL_ALL)
break;
}
spinlock_unlock(&CPU->lock);
CPU->tlb_active = 1;
}
 
#endif /* CONFIG_SMP */
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/backend_anon.c
0,0 → 1,227
/*
* Copyright (c) 2006 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Backend for anonymous memory address space areas.
*
*/
 
#include <mm/as.h>
#include <mm/page.h>
#include <genarch/mm/page_pt.h>
#include <genarch/mm/page_ht.h>
#include <mm/frame.h>
#include <mm/slab.h>
#include <synch/mutex.h>
#include <adt/list.h>
#include <adt/btree.h>
#include <errno.h>
#include <arch/types.h>
#include <align.h>
#include <arch.h>
 
#ifdef CONFIG_VIRT_IDX_DCACHE
#include <arch/mm/cache.h>
#endif
 
static int anon_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access);
static void anon_frame_free(as_area_t *area, uintptr_t page, uintptr_t frame);
static void anon_share(as_area_t *area);
 
mem_backend_t anon_backend = {
.page_fault = anon_page_fault,
.frame_free = anon_frame_free,
.share = anon_share
};
 
/** Service a page fault in the anonymous memory address space area.
*
* The address space area and page tables must be already locked.
*
* @param area Pointer to the address space area.
* @param addr Faulting virtual address.
* @param access Access mode that caused the fault (i.e. read/write/exec).
*
* @return AS_PF_FAULT on failure (i.e. page fault) or AS_PF_OK on success (i.e.
* serviced).
*/
int anon_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access)
{
uintptr_t frame;
bool dirty = false;
 
if (!as_area_check_access(area, access))
return AS_PF_FAULT;
 
if (area->sh_info) {
btree_node_t *leaf;
/*
* The area is shared, chances are that the mapping can be found
* in the pagemap of the address space area share info
* structure.
* In the case that the pagemap does not contain the respective
* mapping, a new frame is allocated and the mapping is created.
*/
mutex_lock(&area->sh_info->lock);
frame = (uintptr_t) btree_search(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base, &leaf);
if (!frame) {
bool allocate = true;
unsigned int i;
/*
* Zero can also be a valid frame address, so a NULL check alone
* is not reliable. Just a small workaround: search the leaf keys.
*/
for (i = 0; i < leaf->keys; i++) {
if (leaf->key[i] ==
ALIGN_DOWN(addr, PAGE_SIZE)) {
allocate = false;
break;
}
}
if (allocate) {
frame = (uintptr_t) frame_alloc(ONE_FRAME, 0);
memsetb(PA2KA(frame), FRAME_SIZE, 0);
dirty = true;
/*
* Insert the address of the newly allocated
* frame to the pagemap.
*/
btree_insert(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base,
(void *) frame, leaf);
}
}
frame_reference_add(ADDR2PFN(frame));
mutex_unlock(&area->sh_info->lock);
} else {
 
/*
* In general, there can be several reasons that
* can have caused this fault.
*
* - non-existent mapping: the area is an anonymous
* area (e.g. heap or stack) and so far has not been
* allocated a frame for the faulting page
*
* - non-present mapping: another possibility,
* currently not implemented, would be frame
* reuse; when this becomes a possibility,
* do not forget to distinguish between
* the different causes
*/
frame = (uintptr_t) frame_alloc(ONE_FRAME, 0);
memsetb(PA2KA(frame), FRAME_SIZE, 0);
dirty = true;
}
/*
* Map 'page' to 'frame'.
* Note that TLB shootdown is not attempted as only new information is
* being inserted into page tables.
*/
page_mapping_insert(AS, addr, frame, as_area_get_flags(area));
if (!used_space_insert(area, ALIGN_DOWN(addr, PAGE_SIZE), 1))
panic("Could not insert used space.\n");
return AS_PF_OK;
}
 
/** Free a frame that is backed by the anonymous memory backend.
*
* The address space area and page tables must be already locked.
*
* @param area Ignored.
* @param page Virtual address of the page corresponding to the frame.
* @param frame Frame to be released.
*/
void anon_frame_free(as_area_t *area, uintptr_t page, uintptr_t frame)
{
frame_free(frame);
}
 
/** Share the anonymous address space area.
*
* Sharing of anonymous area is done by duplicating its entire mapping
* to the pagemap. Page faults will primarily search for frames there.
*
* The address space and address space area must be already locked.
*
* @param area Address space area to be shared.
*/
void anon_share(as_area_t *area)
{
link_t *cur;
 
/*
* Copy used portions of the area to sh_info's page map.
*/
mutex_lock(&area->sh_info->lock);
for (cur = area->used_space.leaf_head.next;
cur != &area->used_space.leaf_head; cur = cur->next) {
btree_node_t *node;
unsigned int i;
node = list_get_instance(cur, btree_node_t, leaf_link);
for (i = 0; i < node->keys; i++) {
uintptr_t base = node->key[i];
count_t count = (count_t) node->value[i];
unsigned int j;
for (j = 0; j < count; j++) {
pte_t *pte;
page_table_lock(area->as, false);
pte = page_mapping_find(area->as,
base + j * PAGE_SIZE);
ASSERT(pte && PTE_VALID(pte) &&
PTE_PRESENT(pte));
btree_insert(&area->sh_info->pagemap,
(base + j * PAGE_SIZE) - area->base,
(void *) PTE_GET_FRAME(pte), NULL);
page_table_unlock(area->as, false);
 
pfn_t pfn = ADDR2PFN(PTE_GET_FRAME(pte));
frame_reference_add(pfn);
}
 
}
}
mutex_unlock(&area->sh_info->lock);
}
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/as.c
0,0 → 1,1815
/*
* Copyright (c) 2001-2006 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Address space related functions.
*
* This file contains address space manipulation functions.
* Roughly speaking, this is a higher-level client of
* Virtual Address Translation (VAT) subsystem.
*
* Functionality provided by this file allows one to
* create address spaces and create, resize and share
* address space areas.
*
* @see page.c
*
*/
 
#include <mm/as.h>
#include <arch/mm/as.h>
#include <mm/page.h>
#include <mm/frame.h>
#include <mm/slab.h>
#include <mm/tlb.h>
#include <arch/mm/page.h>
#include <genarch/mm/page_pt.h>
#include <genarch/mm/page_ht.h>
#include <mm/asid.h>
#include <arch/mm/asid.h>
#include <preemption.h>
#include <synch/spinlock.h>
#include <synch/mutex.h>
#include <adt/list.h>
#include <adt/btree.h>
#include <proc/task.h>
#include <proc/thread.h>
#include <arch/asm.h>
#include <panic.h>
#include <debug.h>
#include <print.h>
#include <memstr.h>
#include <macros.h>
#include <arch.h>
#include <errno.h>
#include <config.h>
#include <align.h>
#include <arch/types.h>
#include <syscall/copy.h>
#include <arch/interrupt.h>
 
#ifdef CONFIG_VIRT_IDX_DCACHE
#include <arch/mm/cache.h>
#endif /* CONFIG_VIRT_IDX_DCACHE */
 
#ifndef __OBJC__
/**
* Each architecture decides what functions will be used to carry out
* address space operations such as creating or locking page tables.
*/
as_operations_t *as_operations = NULL;
 
/**
* Slab for as_t objects.
*/
static slab_cache_t *as_slab;
#endif
 
/**
* This lock serializes access to the ASID subsystem.
* It protects:
* - inactive_as_with_asid_head list
* - as->asid for each as of the as_t type
* - asids_allocated counter
*/
SPINLOCK_INITIALIZE(asidlock);
 
/**
* This list contains address spaces that are not active on any
* processor and that have a valid ASID.
*/
LIST_INITIALIZE(inactive_as_with_asid_head);
 
/** Kernel address space. */
as_t *AS_KERNEL = NULL;
 
static int area_flags_to_page_flags(int aflags);
static as_area_t *find_area_and_lock(as_t *as, uintptr_t va);
static bool check_area_conflicts(as_t *as, uintptr_t va, size_t size,
as_area_t *avoid_area);
static void sh_info_remove_reference(share_info_t *sh_info);
 
#ifndef __OBJC__
static int as_constructor(void *obj, int flags)
{
as_t *as = (as_t *) obj;
int rc;
 
link_initialize(&as->inactive_as_with_asid_link);
mutex_initialize(&as->lock);
rc = as_constructor_arch(as, flags);
return rc;
}
 
static int as_destructor(void *obj)
{
as_t *as = (as_t *) obj;
 
return as_destructor_arch(as);
}
#endif
 
/** Initialize address space subsystem. */
void as_init(void)
{
as_arch_init();
 
#ifndef __OBJC__
as_slab = slab_cache_create("as_slab", sizeof(as_t), 0,
as_constructor, as_destructor, SLAB_CACHE_MAGDEFERRED);
#endif
AS_KERNEL = as_create(FLAG_AS_KERNEL);
if (!AS_KERNEL)
panic("can't create kernel address space\n");
}
 
/** Create address space.
*
* @param flags Flags that influence the way in which the address space is created.
*/
as_t *as_create(int flags)
{
as_t *as;
 
#ifdef __OBJC__
as = [as_t new];
link_initialize(&as->inactive_as_with_asid_link);
mutex_initialize(&as->lock);
(void) as_constructor_arch(as, flags);
#else
as = (as_t *) slab_alloc(as_slab, 0);
#endif
(void) as_create_arch(as, 0);
btree_create(&as->as_area_btree);
if (flags & FLAG_AS_KERNEL)
as->asid = ASID_KERNEL;
else
as->asid = ASID_INVALID;
atomic_set(&as->refcount, 0);
as->cpu_refcount = 0;
#ifdef AS_PAGE_TABLE
as->genarch.page_table = page_table_create(flags);
#else
page_table_create(flags);
#endif
 
return as;
}
 
/** Destroy address space.
*
* When there are no tasks referencing this address space (i.e. its refcount is
* zero), the address space can be destroyed.
*
* We know that we don't hold any spinlock.
*/
void as_destroy(as_t *as)
{
ipl_t ipl;
bool cond;
DEADLOCK_PROBE_INIT(p_asidlock);
 
ASSERT(atomic_get(&as->refcount) == 0);
/*
* Since there is no reference to this address space,
* it is safe not to lock its mutex.
*/
 
/*
* We need to avoid deadlock between TLB shootdown and asidlock.
* We therefore try to take the asidlock conditionally and if we don't succeed,
* we enable interrupts and try again. This is done while preemption is
* disabled to prevent nested context switches. We also depend on the
* fact that so far no spinlocks are held.
*/
preemption_disable();
ipl = interrupts_read();
retry:
interrupts_disable();
if (!spinlock_trylock(&asidlock)) {
interrupts_enable();
DEADLOCK_PROBE(p_asidlock, DEADLOCK_THRESHOLD);
goto retry;
}
preemption_enable(); /* Interrupts disabled, enable preemption */
if (as->asid != ASID_INVALID && as != AS_KERNEL) {
if (as != AS && as->cpu_refcount == 0)
list_remove(&as->inactive_as_with_asid_link);
asid_put(as->asid);
}
spinlock_unlock(&asidlock);
 
/*
* Destroy address space areas of the address space.
* The B+tree must be walked carefully because it is
* also being destroyed.
*/
for (cond = true; cond; ) {
btree_node_t *node;
 
ASSERT(!list_empty(&as->as_area_btree.leaf_head));
node = list_get_instance(as->as_area_btree.leaf_head.next,
btree_node_t, leaf_link);
 
if ((cond = node->keys)) {
as_area_destroy(as, node->key[0]);
}
}
 
btree_destroy(&as->as_area_btree);
#ifdef AS_PAGE_TABLE
page_table_destroy(as->genarch.page_table);
#else
page_table_destroy(NULL);
#endif
 
interrupts_restore(ipl);
 
#ifdef __OBJC__
[as free];
#else
slab_free(as_slab, as);
#endif
}
 
/** Create address space area of common attributes.
*
* The created address space area is added to the target address space.
*
* @param as Target address space.
* @param flags Flags of the area memory.
* @param size Size of area.
* @param base Base address of area.
* @param attrs Attributes of the area.
* @param backend Address space area backend. NULL if no backend is used.
* @param backend_data NULL or a pointer to an array holding two void *.
*
* @return Address space area on success or NULL on failure.
*/
as_area_t *
as_area_create(as_t *as, int flags, size_t size, uintptr_t base, int attrs,
mem_backend_t *backend, mem_backend_data_t *backend_data)
{
ipl_t ipl;
as_area_t *a;
if (base % PAGE_SIZE)
return NULL;
 
if (!size)
return NULL;
 
/* Writeable executable areas are not supported. */
if ((flags & AS_AREA_EXEC) && (flags & AS_AREA_WRITE))
return NULL;
ipl = interrupts_disable();
mutex_lock(&as->lock);
if (!check_area_conflicts(as, base, size, NULL)) {
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return NULL;
}
a = (as_area_t *) malloc(sizeof(as_area_t), 0);
 
mutex_initialize(&a->lock);
a->as = as;
a->flags = flags;
a->attributes = attrs;
a->pages = SIZE2FRAMES(size);
a->base = base;
a->sh_info = NULL;
a->backend = backend;
if (backend_data)
a->backend_data = *backend_data;
else
memsetb((uintptr_t) &a->backend_data, sizeof(a->backend_data),
0);
 
btree_create(&a->used_space);
btree_insert(&as->as_area_btree, base, (void *) a, NULL);
 
mutex_unlock(&as->lock);
interrupts_restore(ipl);
 
return a;
}
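/*
 * Illustrative sketch (not part of the original file): creating an
 * anonymous, readable and writable area in the current address space.
 * The base and size values are placeholders and AS_AREA_ATTR_NONE is
 * assumed to denote "no special attributes".
 *
 *	as_area_t *a;
 *	a = as_area_create(AS, AS_AREA_READ | AS_AREA_WRITE |
 *	    AS_AREA_CACHEABLE, size, base, AS_AREA_ATTR_NONE,
 *	    &anon_backend, NULL);
 *	if (!a)
 *		return ENOMEM;
 */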
 
/** Find address space area and change it.
*
* @param as Address space.
* @param address Virtual address belonging to the area to be changed. Must be
* page-aligned.
* @param size New size of the virtual memory block starting at address.
* @param flags Flags influencing the remap operation. Currently unused.
*
* @return Zero on success or a value from @ref errno.h otherwise.
*/
int as_area_resize(as_t *as, uintptr_t address, size_t size, int flags)
{
as_area_t *area;
ipl_t ipl;
size_t pages;
ipl = interrupts_disable();
mutex_lock(&as->lock);
/*
* Locate the area.
*/
area = find_area_and_lock(as, address);
if (!area) {
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return ENOENT;
}
 
if (area->backend == &phys_backend) {
/*
* Remapping of address space areas associated
* with memory mapped devices is not supported.
*/
mutex_unlock(&area->lock);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return ENOTSUP;
}
if (area->sh_info) {
/*
* Remapping of shared address space areas
* is not supported.
*/
mutex_unlock(&area->lock);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return ENOTSUP;
}
 
pages = SIZE2FRAMES((address - area->base) + size);
if (!pages) {
/*
* Zero size address space areas are not allowed.
*/
mutex_unlock(&area->lock);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return EPERM;
}
if (pages < area->pages) {
bool cond;
uintptr_t start_free = area->base + pages*PAGE_SIZE;
 
/*
* Shrinking the area.
* No need to check for overlaps.
*/
 
/*
* Start TLB shootdown sequence.
*/
tlb_shootdown_start(TLB_INVL_PAGES, AS->asid, area->base +
pages * PAGE_SIZE, area->pages - pages);
 
/*
* Remove frames belonging to used space starting from
* the highest addresses downwards until an overlap with
* the resized address space area is found. Note that this
* is also the right way to remove part of the used_space
* B+tree leaf list.
*/
for (cond = true; cond;) {
btree_node_t *node;
ASSERT(!list_empty(&area->used_space.leaf_head));
node =
list_get_instance(area->used_space.leaf_head.prev,
btree_node_t, leaf_link);
if ((cond = (bool) node->keys)) {
uintptr_t b = node->key[node->keys - 1];
count_t c =
(count_t) node->value[node->keys - 1];
unsigned int i = 0;
if (overlaps(b, c * PAGE_SIZE, area->base,
pages * PAGE_SIZE)) {
if (b + c * PAGE_SIZE <= start_free) {
/*
* The whole interval fits
* completely in the resized
* address space area.
*/
break;
}
/*
* Part of the interval corresponding
* to b and c overlaps with the resized
* address space area.
*/
cond = false; /* we are almost done */
i = (start_free - b) >> PAGE_WIDTH;
if (!used_space_remove(area, start_free,
c - i))
panic("Could not remove used "
"space.\n");
} else {
/*
* The interval of used space can be
* completely removed.
*/
if (!used_space_remove(area, b, c))
panic("Could not remove used "
"space.\n");
}
for (; i < c; i++) {
pte_t *pte;
page_table_lock(as, false);
pte = page_mapping_find(as, b +
i * PAGE_SIZE);
ASSERT(pte && PTE_VALID(pte) &&
PTE_PRESENT(pte));
if (area->backend &&
area->backend->frame_free) {
area->backend->frame_free(area,
b + i * PAGE_SIZE,
PTE_GET_FRAME(pte));
}
page_mapping_remove(as, b +
i * PAGE_SIZE);
page_table_unlock(as, false);
}
}
}
 
/*
* Finish TLB shootdown sequence.
*/
 
tlb_invalidate_pages(as->asid, area->base + pages * PAGE_SIZE,
area->pages - pages);
/*
* Invalidate software translation caches (e.g. TSB on sparc64).
*/
as_invalidate_translation_cache(as, area->base +
pages * PAGE_SIZE, area->pages - pages);
tlb_shootdown_finalize();
} else {
/*
* Growing the area.
* Check for overlaps with other address space areas.
*/
if (!check_area_conflicts(as, address, pages * PAGE_SIZE,
area)) {
mutex_unlock(&area->lock);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return EADDRNOTAVAIL;
}
}
 
area->pages = pages;
mutex_unlock(&area->lock);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
 
return 0;
}
 
/** Destroy address space area.
*
* @param as Address space.
* @param address Address within the area to be deleted.
*
* @return Zero on success or a value from @ref errno.h on failure.
*/
int as_area_destroy(as_t *as, uintptr_t address)
{
as_area_t *area;
uintptr_t base;
link_t *cur;
ipl_t ipl;
 
ipl = interrupts_disable();
mutex_lock(&as->lock);
 
area = find_area_and_lock(as, address);
if (!area) {
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return ENOENT;
}
 
base = area->base;
 
/*
* Start TLB shootdown sequence.
*/
tlb_shootdown_start(TLB_INVL_PAGES, as->asid, area->base, area->pages);
 
/*
* Visit only the pages mapped by used_space B+tree.
*/
for (cur = area->used_space.leaf_head.next;
cur != &area->used_space.leaf_head; cur = cur->next) {
btree_node_t *node;
unsigned int i;
node = list_get_instance(cur, btree_node_t, leaf_link);
for (i = 0; i < node->keys; i++) {
uintptr_t b = node->key[i];
count_t j;
pte_t *pte;
for (j = 0; j < (count_t) node->value[i]; j++) {
page_table_lock(as, false);
pte = page_mapping_find(as, b + j * PAGE_SIZE);
ASSERT(pte && PTE_VALID(pte) &&
PTE_PRESENT(pte));
if (area->backend &&
area->backend->frame_free) {
area->backend->frame_free(area, b +
j * PAGE_SIZE, PTE_GET_FRAME(pte));
}
page_mapping_remove(as, b + j * PAGE_SIZE);
page_table_unlock(as, false);
}
}
}
 
/*
* Finish TLB shootdown sequence.
*/
 
tlb_invalidate_pages(as->asid, area->base, area->pages);
/*
* Invalidate potential software translation caches (e.g. TSB on
* sparc64).
*/
as_invalidate_translation_cache(as, area->base, area->pages);
tlb_shootdown_finalize();
btree_destroy(&area->used_space);
 
area->attributes |= AS_AREA_ATTR_PARTIAL;
if (area->sh_info)
sh_info_remove_reference(area->sh_info);
mutex_unlock(&area->lock);
 
/*
* Remove the empty area from address space.
*/
btree_remove(&as->as_area_btree, base, NULL);
free(area);
mutex_unlock(&as->lock);
interrupts_restore(ipl);
return 0;
}
 
/** Share address space area with another or the same address space.
*
* Address space area mapping is shared with a new address space area.
* If the source address space area has not been shared so far,
* a new sh_info is created. The new address space area simply gets the
* sh_info of the source area. The process of duplicating the
* mapping is done through the backend share function.
*
* @param src_as Pointer to source address space.
* @param src_base Base address of the source address space area.
* @param acc_size Expected size of the source area.
* @param dst_as Pointer to destination address space.
* @param dst_base Target base address.
* @param dst_flags_mask Destination address space area flags mask.
*
* @return Zero on success or ENOENT if there is no such task or if there is no
* such address space area, EPERM if there was a problem in accepting the area
* or ENOMEM if there was a problem in allocating destination address space
* area. ENOTSUP is returned if the address space area backend does not support
* sharing.
*/
int as_area_share(as_t *src_as, uintptr_t src_base, size_t acc_size,
as_t *dst_as, uintptr_t dst_base, int dst_flags_mask)
{
ipl_t ipl;
int src_flags;
size_t src_size;
as_area_t *src_area, *dst_area;
share_info_t *sh_info;
mem_backend_t *src_backend;
mem_backend_data_t src_backend_data;
ipl = interrupts_disable();
mutex_lock(&src_as->lock);
src_area = find_area_and_lock(src_as, src_base);
if (!src_area) {
/*
* Could not find the source address space area.
*/
mutex_unlock(&src_as->lock);
interrupts_restore(ipl);
return ENOENT;
}
 
if (!src_area->backend || !src_area->backend->share) {
/*
* There is no backend or the backend does not
* know how to share the area.
*/
mutex_unlock(&src_area->lock);
mutex_unlock(&src_as->lock);
interrupts_restore(ipl);
return ENOTSUP;
}
src_size = src_area->pages * PAGE_SIZE;
src_flags = src_area->flags;
src_backend = src_area->backend;
src_backend_data = src_area->backend_data;
 
/* Share the cacheable flag from the original mapping */
if (src_flags & AS_AREA_CACHEABLE)
dst_flags_mask |= AS_AREA_CACHEABLE;
 
if (src_size != acc_size ||
(src_flags & dst_flags_mask) != dst_flags_mask) {
mutex_unlock(&src_area->lock);
mutex_unlock(&src_as->lock);
interrupts_restore(ipl);
return EPERM;
}
 
/*
* Now we are committed to sharing the area.
* First, prepare the area for sharing.
* Then it will be safe to unlock it.
*/
sh_info = src_area->sh_info;
if (!sh_info) {
sh_info = (share_info_t *) malloc(sizeof(share_info_t), 0);
mutex_initialize(&sh_info->lock);
sh_info->refcount = 2;
btree_create(&sh_info->pagemap);
src_area->sh_info = sh_info;
/*
* Call the backend to setup sharing.
*/
src_area->backend->share(src_area);
} else {
mutex_lock(&sh_info->lock);
sh_info->refcount++;
mutex_unlock(&sh_info->lock);
}
 
mutex_unlock(&src_area->lock);
mutex_unlock(&src_as->lock);
 
/*
* Create copy of the source address space area.
* The destination area is created with AS_AREA_ATTR_PARTIAL
* attribute set which prevents race condition with
* preliminary as_page_fault() calls.
* The flags of the source area are masked against dst_flags_mask
* to support sharing in less privileged mode.
*/
dst_area = as_area_create(dst_as, dst_flags_mask, src_size, dst_base,
AS_AREA_ATTR_PARTIAL, src_backend, &src_backend_data);
if (!dst_area) {
/*
* Destination address space area could not be created.
*/
sh_info_remove_reference(sh_info);
interrupts_restore(ipl);
return ENOMEM;
}
 
/*
* Now the destination address space area has been
* fully initialized. Clear the AS_AREA_ATTR_PARTIAL
* attribute and set the sh_info.
*/
mutex_lock(&dst_as->lock);
mutex_lock(&dst_area->lock);
dst_area->attributes &= ~AS_AREA_ATTR_PARTIAL;
dst_area->sh_info = sh_info;
mutex_unlock(&dst_area->lock);
mutex_unlock(&dst_as->lock);
 
interrupts_restore(ipl);
return 0;
}
 
/** Check access mode for address space area.
*
* The address space area must be locked prior to this call.
*
* @param area Address space area.
* @param access Access mode.
*
* @return False if access violates area's permissions, true otherwise.
*/
bool as_area_check_access(as_area_t *area, pf_access_t access)
{
int flagmap[] = {
[PF_ACCESS_READ] = AS_AREA_READ,
[PF_ACCESS_WRITE] = AS_AREA_WRITE,
[PF_ACCESS_EXEC] = AS_AREA_EXEC
};
 
if (!(area->flags & flagmap[access]))
return false;
return true;
}
 
/** Handle page fault within the current address space.
*
* This is the high-level page fault handler. It decides
* whether the page fault can be resolved by any backend
* and if so, it invokes the backend to resolve the page
* fault.
*
* Interrupts are assumed disabled.
*
* @param page Faulting page.
* @param access Access mode that caused the fault (i.e. read/write/exec).
* @param istate Pointer to interrupted state.
*
* @return AS_PF_FAULT on page fault, AS_PF_OK on success or AS_PF_DEFER if the
* fault was caused by copy_to_uspace() or copy_from_uspace().
*/
int as_page_fault(uintptr_t page, pf_access_t access, istate_t *istate)
{
pte_t *pte;
as_area_t *area;
if (!THREAD)
return AS_PF_FAULT;
ASSERT(AS);
 
mutex_lock(&AS->lock);
area = find_area_and_lock(AS, page);
if (!area) {
/*
* No area contained mapping for 'page'.
* Signal page fault to low-level handler.
*/
mutex_unlock(&AS->lock);
goto page_fault;
}
 
if (area->attributes & AS_AREA_ATTR_PARTIAL) {
/*
* The address space area is not fully initialized.
* Avoid possible race by returning error.
*/
mutex_unlock(&area->lock);
mutex_unlock(&AS->lock);
goto page_fault;
}
 
if (!area->backend || !area->backend->page_fault) {
/*
* The address space area is not backed by any backend
* or the backend cannot handle page faults.
*/
mutex_unlock(&area->lock);
mutex_unlock(&AS->lock);
goto page_fault;
}
 
page_table_lock(AS, false);
/*
* To avoid race condition between two page faults
* on the same address, we need to make sure
* the mapping has not been already inserted.
*/
if ((pte = page_mapping_find(AS, page))) {
if (PTE_PRESENT(pte)) {
if (((access == PF_ACCESS_READ) && PTE_READABLE(pte)) ||
(access == PF_ACCESS_WRITE && PTE_WRITABLE(pte)) ||
(access == PF_ACCESS_EXEC && PTE_EXECUTABLE(pte))) {
page_table_unlock(AS, false);
mutex_unlock(&area->lock);
mutex_unlock(&AS->lock);
return AS_PF_OK;
}
}
}
/*
* Resort to the backend page fault handler.
*/
if (area->backend->page_fault(area, page, access) != AS_PF_OK) {
page_table_unlock(AS, false);
mutex_unlock(&area->lock);
mutex_unlock(&AS->lock);
goto page_fault;
}
page_table_unlock(AS, false);
mutex_unlock(&area->lock);
mutex_unlock(&AS->lock);
return AS_PF_OK;
 
page_fault:
if (THREAD->in_copy_from_uspace) {
THREAD->in_copy_from_uspace = false;
istate_set_retaddr(istate,
(uintptr_t) &memcpy_from_uspace_failover_address);
} else if (THREAD->in_copy_to_uspace) {
THREAD->in_copy_to_uspace = false;
istate_set_retaddr(istate,
(uintptr_t) &memcpy_to_uspace_failover_address);
} else {
return AS_PF_FAULT;
}
 
return AS_PF_DEFER;
}
 
/** Switch address spaces.
*
* Note that this function cannot sleep as it is essentially a part of
* scheduling. Sleeping here would lead to deadlock on wakeup. Another
* thing which is forbidden in this context is locking the address space.
*
* When this function is entered, no spinlocks may be held.
*
* @param old_as Old address space or NULL.
* @param new_as New address space.
*/
void as_switch(as_t *old_as, as_t *new_as)
{
DEADLOCK_PROBE_INIT(p_asidlock);
preemption_disable();
retry:
(void) interrupts_disable();
if (!spinlock_trylock(&asidlock)) {
/*
* Avoid deadlock with TLB shootdown.
* We can enable interrupts here because
* preemption is disabled. We should not be
* holding any other lock.
*/
(void) interrupts_enable();
DEADLOCK_PROBE(p_asidlock, DEADLOCK_THRESHOLD);
goto retry;
}
preemption_enable();
 
/*
* First, take care of the old address space.
*/
if (old_as) {
ASSERT(old_as->cpu_refcount);
if((--old_as->cpu_refcount == 0) && (old_as != AS_KERNEL)) {
/*
* The old address space is no longer active on
* any processor. It can be appended to the
* list of inactive address spaces with assigned
* ASID.
*/
ASSERT(old_as->asid != ASID_INVALID);
list_append(&old_as->inactive_as_with_asid_link,
&inactive_as_with_asid_head);
}
 
/*
* Perform architecture-specific tasks when the address space
* is being removed from the CPU.
*/
as_deinstall_arch(old_as);
}
 
/*
* Second, prepare the new address space.
*/
if ((new_as->cpu_refcount++ == 0) && (new_as != AS_KERNEL)) {
if (new_as->asid != ASID_INVALID)
list_remove(&new_as->inactive_as_with_asid_link);
else
new_as->asid = asid_get();
}
#ifdef AS_PAGE_TABLE
SET_PTL0_ADDRESS(new_as->genarch.page_table);
#endif
/*
* Perform architecture-specific steps.
* (e.g. write ASID to hardware register etc.)
*/
as_install_arch(new_as);
 
spinlock_unlock(&asidlock);
AS = new_as;
}
 
/** Convert address space area flags to page flags.
*
* @param aflags Flags of some address space area.
*
* @return Flags to be passed to page_mapping_insert().
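 *
 * For example, AS_AREA_READ | AS_AREA_CACHEABLE translates to
 * PAGE_USER | PAGE_PRESENT | PAGE_READ | PAGE_CACHEABLE.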
*/
int area_flags_to_page_flags(int aflags)
{
int flags;
 
flags = PAGE_USER | PAGE_PRESENT;
if (aflags & AS_AREA_READ)
flags |= PAGE_READ;
if (aflags & AS_AREA_WRITE)
flags |= PAGE_WRITE;
if (aflags & AS_AREA_EXEC)
flags |= PAGE_EXEC;
if (aflags & AS_AREA_CACHEABLE)
flags |= PAGE_CACHEABLE;
return flags;
}
 
/** Compute flags for virtual address translation subsystem.
*
* The address space area must be locked.
* Interrupts must be disabled.
*
* @param a Address space area.
*
* @return Flags to be used in page_mapping_insert().
*/
int as_area_get_flags(as_area_t *a)
{
return area_flags_to_page_flags(a->flags);
}
 
/** Create page table.
*
* Depending on architecture, create either address space
* private or global page table.
*
* @param flags Flags saying whether the page table is for kernel address space.
*
* @return First entry of the page table.
*/
pte_t *page_table_create(int flags)
{
#ifdef __OBJC__
return [as_t page_table_create: flags];
#else
ASSERT(as_operations);
ASSERT(as_operations->page_table_create);
return as_operations->page_table_create(flags);
#endif
}
 
/** Destroy page table.
*
* Destroy page table in architecture specific way.
*
* @param page_table Physical address of PTL0.
*/
void page_table_destroy(pte_t *page_table)
{
#ifdef __OBJC__
return [as_t page_table_destroy: page_table];
#else
ASSERT(as_operations);
ASSERT(as_operations->page_table_destroy);
as_operations->page_table_destroy(page_table);
#endif
}
 
/** Lock page table.
*
* This function should be called before any page_mapping_insert(),
* page_mapping_remove() and page_mapping_find().
*
* Locking order is such that address space areas must be locked
* prior to this call. Address space can be locked prior to this
* call in which case the lock argument is false.
*
* @param as Address space.
* @param lock If false, do not attempt to lock as->lock.
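 *
 * Typical bracketing, as used by as_page_fault() above:
 * @code
 *	page_table_lock(AS, false);
 *	pte = page_mapping_find(AS, page);
 *	...
 *	page_table_unlock(AS, false);
 * @endcode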
*/
void page_table_lock(as_t *as, bool lock)
{
#ifdef __OBJC__
[as page_table_lock: lock];
#else
ASSERT(as_operations);
ASSERT(as_operations->page_table_lock);
as_operations->page_table_lock(as, lock);
#endif
}
 
/** Unlock page table.
*
* @param as Address space.
* @param unlock If false, do not attempt to unlock as->lock.
*/
void page_table_unlock(as_t *as, bool unlock)
{
#ifdef __OBJC__
[as page_table_unlock: unlock];
#else
ASSERT(as_operations);
ASSERT(as_operations->page_table_unlock);
as_operations->page_table_unlock(as, unlock);
#endif
}
 
 
/** Find address space area and lock it.
*
* The address space must be locked and interrupts must be disabled.
*
* @param as Address space.
* @param va Virtual address.
*
* @return Locked address space area containing va on success or NULL on
* failure.
*/
as_area_t *find_area_and_lock(as_t *as, uintptr_t va)
{
as_area_t *a;
btree_node_t *leaf, *lnode;
unsigned int i;
a = (as_area_t *) btree_search(&as->as_area_btree, va, &leaf);
if (a) {
/* va is the base address of an address space area */
mutex_lock(&a->lock);
return a;
}
/*
 * Search the leaf node and the rightmost record of its left neighbour
* to find out whether this is a miss or va belongs to an address
* space area found there.
*/
/* First, search the leaf node itself. */
for (i = 0; i < leaf->keys; i++) {
a = (as_area_t *) leaf->value[i];
mutex_lock(&a->lock);
if ((a->base <= va) && (va < a->base + a->pages * PAGE_SIZE)) {
return a;
}
mutex_unlock(&a->lock);
}
 
/*
* Second, locate the left neighbour and test its last record.
* Because of its position in the B+tree, it must have base < va.
*/
lnode = btree_leaf_node_left_neighbour(&as->as_area_btree, leaf);
if (lnode) {
a = (as_area_t *) lnode->value[lnode->keys - 1];
mutex_lock(&a->lock);
if (va < a->base + a->pages * PAGE_SIZE) {
return a;
}
mutex_unlock(&a->lock);
}
 
return NULL;
}
 
/** Check area conflicts with other areas.
*
* The address space must be locked and interrupts must be disabled.
*
* @param as Address space.
* @param va Starting virtual address of the area being tested.
* @param size Size of the area being tested.
* @param avoid_area Do not touch this area.
*
* @return True if there is no conflict, false otherwise.
*/
bool check_area_conflicts(as_t *as, uintptr_t va, size_t size,
as_area_t *avoid_area)
{
as_area_t *a;
btree_node_t *leaf, *node;
unsigned int i;
/*
 * We don't want any area to have conflicts with the NULL page.
*/
if (overlaps(va, size, NULL, PAGE_SIZE))
return false;
/*
* The leaf node is found in O(log n), where n is proportional to
* the number of address space areas belonging to as.
* The check for conflicts is then attempted on the rightmost
* record in the left neighbour, the leftmost record in the right
* neighbour and all records in the leaf node itself.
*/
if ((a = (as_area_t *) btree_search(&as->as_area_btree, va, &leaf))) {
if (a != avoid_area)
return false;
}
/* First, check the two border cases. */
if ((node = btree_leaf_node_left_neighbour(&as->as_area_btree, leaf))) {
a = (as_area_t *) node->value[node->keys - 1];
mutex_lock(&a->lock);
if (overlaps(va, size, a->base, a->pages * PAGE_SIZE)) {
mutex_unlock(&a->lock);
return false;
}
mutex_unlock(&a->lock);
}
node = btree_leaf_node_right_neighbour(&as->as_area_btree, leaf);
if (node) {
a = (as_area_t *) node->value[0];
mutex_lock(&a->lock);
if (overlaps(va, size, a->base, a->pages * PAGE_SIZE)) {
mutex_unlock(&a->lock);
return false;
}
mutex_unlock(&a->lock);
}
/* Second, check the leaf node. */
for (i = 0; i < leaf->keys; i++) {
a = (as_area_t *) leaf->value[i];
if (a == avoid_area)
continue;
mutex_lock(&a->lock);
if (overlaps(va, size, a->base, a->pages * PAGE_SIZE)) {
mutex_unlock(&a->lock);
return false;
}
mutex_unlock(&a->lock);
}
 
/*
* So far, the area does not conflict with other areas.
 * Check that it does not conflict with the kernel address space.
*/
if (!KERNEL_ADDRESS_SPACE_SHADOWED) {
return !overlaps(va, size,
KERNEL_ADDRESS_SPACE_START,
KERNEL_ADDRESS_SPACE_END - KERNEL_ADDRESS_SPACE_START);
}
 
return true;
}
 
/** Return size of the address space area with given base.
*
 * @param base Arbitrary address inside the address space area.
*
* @return Size of the address space area in bytes or zero if it
* does not exist.
*/
size_t as_area_get_size(uintptr_t base)
{
ipl_t ipl;
as_area_t *src_area;
size_t size;
 
ipl = interrupts_disable();
src_area = find_area_and_lock(AS, base);
if (src_area){
size = src_area->pages * PAGE_SIZE;
mutex_unlock(&src_area->lock);
} else {
size = 0;
}
interrupts_restore(ipl);
return size;
}
 
/** Mark portion of address space area as used.
*
* The address space area must be already locked.
*
* @param a Address space area.
* @param page First page to be marked.
 * @param count Number of pages to be marked.
*
* @return 0 on failure and 1 on success.
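 *
 * Illustrative use (taken from the ELF backend later in this revision),
 * marking a single page as used after its mapping has been installed:
 * @code
 *	if (!used_space_insert(area, ALIGN_DOWN(addr, PAGE_SIZE), 1))
 *		panic("Could not insert used space.\n");
 * @endcode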
*/
int used_space_insert(as_area_t *a, uintptr_t page, count_t count)
{
btree_node_t *leaf, *node;
count_t pages;
unsigned int i;
 
ASSERT(page == ALIGN_DOWN(page, PAGE_SIZE));
ASSERT(count);
 
pages = (count_t) btree_search(&a->used_space, page, &leaf);
if (pages) {
/*
* We hit the beginning of some used space.
*/
return 0;
}
 
if (!leaf->keys) {
btree_insert(&a->used_space, page, (void *) count, leaf);
return 1;
}
 
node = btree_leaf_node_left_neighbour(&a->used_space, leaf);
if (node) {
uintptr_t left_pg = node->key[node->keys - 1];
uintptr_t right_pg = leaf->key[0];
count_t left_cnt = (count_t) node->value[node->keys - 1];
count_t right_cnt = (count_t) leaf->value[0];
/*
* Examine the possibility that the interval fits
* somewhere between the rightmost interval of
 * the left neighbour and the first interval of the leaf.
*/
if (page >= right_pg) {
/* Do nothing. */
} else if (overlaps(page, count * PAGE_SIZE, left_pg,
left_cnt * PAGE_SIZE)) {
/* The interval intersects with the left interval. */
return 0;
} else if (overlaps(page, count * PAGE_SIZE, right_pg,
right_cnt * PAGE_SIZE)) {
/* The interval intersects with the right interval. */
return 0;
} else if ((page == left_pg + left_cnt * PAGE_SIZE) &&
(page + count * PAGE_SIZE == right_pg)) {
/*
* The interval can be added by merging the two already
* present intervals.
*/
node->value[node->keys - 1] += count + right_cnt;
btree_remove(&a->used_space, right_pg, leaf);
return 1;
} else if (page == left_pg + left_cnt * PAGE_SIZE) {
/*
* The interval can be added by simply growing the left
* interval.
*/
node->value[node->keys - 1] += count;
return 1;
} else if (page + count * PAGE_SIZE == right_pg) {
/*
 * The interval can be added by simply moving the base of
* the right interval down and increasing its size
* accordingly.
*/
leaf->value[0] += count;
leaf->key[0] = page;
return 1;
} else {
/*
 * The interval is between both neighbouring intervals,
* but cannot be merged with any of them.
*/
btree_insert(&a->used_space, page, (void *) count,
leaf);
return 1;
}
} else if (page < leaf->key[0]) {
uintptr_t right_pg = leaf->key[0];
count_t right_cnt = (count_t) leaf->value[0];
/*
* Investigate the border case in which the left neighbour does
* not exist but the interval fits from the left.
*/
if (overlaps(page, count * PAGE_SIZE, right_pg,
right_cnt * PAGE_SIZE)) {
/* The interval intersects with the right interval. */
return 0;
} else if (page + count * PAGE_SIZE == right_pg) {
/*
* The interval can be added by moving the base of the
* right interval down and increasing its size
* accordingly.
*/
leaf->key[0] = page;
leaf->value[0] += count;
return 1;
} else {
/*
 * The interval does not adjoin the right interval.
* It must be added individually.
*/
btree_insert(&a->used_space, page, (void *) count,
leaf);
return 1;
}
}
 
node = btree_leaf_node_right_neighbour(&a->used_space, leaf);
if (node) {
uintptr_t left_pg = leaf->key[leaf->keys - 1];
uintptr_t right_pg = node->key[0];
count_t left_cnt = (count_t) leaf->value[leaf->keys - 1];
count_t right_cnt = (count_t) node->value[0];
/*
* Examine the possibility that the interval fits
* somewhere between the leftmost interval of
 * the right neighbour and the last interval of the leaf.
*/
 
if (page < left_pg) {
/* Do nothing. */
} else if (overlaps(page, count * PAGE_SIZE, left_pg,
left_cnt * PAGE_SIZE)) {
/* The interval intersects with the left interval. */
return 0;
} else if (overlaps(page, count * PAGE_SIZE, right_pg,
right_cnt * PAGE_SIZE)) {
/* The interval intersects with the right interval. */
return 0;
} else if ((page == left_pg + left_cnt * PAGE_SIZE) &&
(page + count * PAGE_SIZE == right_pg)) {
/*
* The interval can be added by merging the two already
* present intervals.
 */
leaf->value[leaf->keys - 1] += count + right_cnt;
btree_remove(&a->used_space, right_pg, node);
return 1;
} else if (page == left_pg + left_cnt * PAGE_SIZE) {
/*
* The interval can be added by simply growing the left
* interval.
 */
leaf->value[leaf->keys - 1] += count;
return 1;
} else if (page + count * PAGE_SIZE == right_pg) {
/*
 * The interval can be added by simply moving the base of
* the right interval down and increasing its size
* accordingly.
*/
node->value[0] += count;
node->key[0] = page;
return 1;
} else {
/*
 * The interval is between both neighbouring intervals,
* but cannot be merged with any of them.
*/
btree_insert(&a->used_space, page, (void *) count,
leaf);
return 1;
}
} else if (page >= leaf->key[leaf->keys - 1]) {
uintptr_t left_pg = leaf->key[leaf->keys - 1];
count_t left_cnt = (count_t) leaf->value[leaf->keys - 1];
/*
* Investigate the border case in which the right neighbour
* does not exist but the interval fits from the right.
*/
if (overlaps(page, count * PAGE_SIZE, left_pg,
left_cnt * PAGE_SIZE)) {
/* The interval intersects with the left interval. */
return 0;
} else if (left_pg + left_cnt * PAGE_SIZE == page) {
/*
* The interval can be added by growing the left
* interval.
*/
leaf->value[leaf->keys - 1] += count;
return 1;
} else {
/*
 * The interval does not adjoin the left interval.
* It must be added individually.
*/
btree_insert(&a->used_space, page, (void *) count,
leaf);
return 1;
}
}
/*
* Note that if the algorithm made it thus far, the interval can fit
* only between two other intervals of the leaf. The two border cases
* were already resolved.
*/
for (i = 1; i < leaf->keys; i++) {
if (page < leaf->key[i]) {
uintptr_t left_pg = leaf->key[i - 1];
uintptr_t right_pg = leaf->key[i];
count_t left_cnt = (count_t) leaf->value[i - 1];
count_t right_cnt = (count_t) leaf->value[i];
 
/*
* The interval fits between left_pg and right_pg.
*/
 
if (overlaps(page, count * PAGE_SIZE, left_pg,
left_cnt * PAGE_SIZE)) {
/*
* The interval intersects with the left
* interval.
*/
return 0;
} else if (overlaps(page, count * PAGE_SIZE, right_pg,
right_cnt * PAGE_SIZE)) {
/*
* The interval intersects with the right
* interval.
*/
return 0;
} else if ((page == left_pg + left_cnt * PAGE_SIZE) &&
(page + count * PAGE_SIZE == right_pg)) {
/*
* The interval can be added by merging the two
* already present intervals.
*/
leaf->value[i - 1] += count + right_cnt;
btree_remove(&a->used_space, right_pg, leaf);
return 1;
} else if (page == left_pg + left_cnt * PAGE_SIZE) {
/*
* The interval can be added by simply growing
* the left interval.
*/
leaf->value[i - 1] += count;
return 1;
} else if (page + count * PAGE_SIZE == right_pg) {
/*
 * The interval can be added by simply moving
 * the base of the right interval down and
* increasing its size accordingly.
*/
leaf->value[i] += count;
leaf->key[i] = page;
return 1;
} else {
/*
 * The interval is between both neighbouring
* intervals, but cannot be merged with any of
* them.
*/
btree_insert(&a->used_space, page,
(void *) count, leaf);
return 1;
}
}
}
 
panic("Inconsistency detected while adding %d pages of used space at "
"%p.\n", count, page);
}
 
/** Mark portion of address space area as unused.
*
* The address space area must be already locked.
*
* @param a Address space area.
* @param page First page to be marked.
 * @param count Number of pages to be marked.
*
* @return 0 on failure and 1 on success.
*/
int used_space_remove(as_area_t *a, uintptr_t page, count_t count)
{
btree_node_t *leaf, *node;
count_t pages;
unsigned int i;
 
ASSERT(page == ALIGN_DOWN(page, PAGE_SIZE));
ASSERT(count);
 
pages = (count_t) btree_search(&a->used_space, page, &leaf);
if (pages) {
/*
* We are lucky, page is the beginning of some interval.
*/
if (count > pages) {
return 0;
} else if (count == pages) {
btree_remove(&a->used_space, page, leaf);
return 1;
} else {
/*
* Find the respective interval.
* Decrease its size and relocate its start address.
*/
for (i = 0; i < leaf->keys; i++) {
if (leaf->key[i] == page) {
leaf->key[i] += count * PAGE_SIZE;
leaf->value[i] -= count;
return 1;
}
}
goto error;
}
}
 
node = btree_leaf_node_left_neighbour(&a->used_space, leaf);
if (node && page < leaf->key[0]) {
uintptr_t left_pg = node->key[node->keys - 1];
count_t left_cnt = (count_t) node->value[node->keys - 1];
 
if (overlaps(left_pg, left_cnt * PAGE_SIZE, page,
count * PAGE_SIZE)) {
if (page + count * PAGE_SIZE ==
left_pg + left_cnt * PAGE_SIZE) {
/*
* The interval is contained in the rightmost
* interval of the left neighbour and can be
* removed by updating the size of the bigger
* interval.
*/
node->value[node->keys - 1] -= count;
return 1;
} else if (page + count * PAGE_SIZE <
left_pg + left_cnt*PAGE_SIZE) {
count_t new_cnt;
/*
* The interval is contained in the rightmost
* interval of the left neighbour but its
* removal requires both updating the size of
* the original interval and also inserting a
* new interval.
*/
new_cnt = ((left_pg + left_cnt * PAGE_SIZE) -
(page + count*PAGE_SIZE)) >> PAGE_WIDTH;
node->value[node->keys - 1] -= count + new_cnt;
btree_insert(&a->used_space, page +
count * PAGE_SIZE, (void *) new_cnt, leaf);
return 1;
}
}
return 0;
} else if (page < leaf->key[0]) {
return 0;
}
if (page > leaf->key[leaf->keys - 1]) {
uintptr_t left_pg = leaf->key[leaf->keys - 1];
count_t left_cnt = (count_t) leaf->value[leaf->keys - 1];
 
if (overlaps(left_pg, left_cnt * PAGE_SIZE, page,
count * PAGE_SIZE)) {
if (page + count * PAGE_SIZE ==
left_pg + left_cnt * PAGE_SIZE) {
/*
* The interval is contained in the rightmost
* interval of the leaf and can be removed by
* updating the size of the bigger interval.
*/
leaf->value[leaf->keys - 1] -= count;
return 1;
} else if (page + count * PAGE_SIZE < left_pg +
left_cnt * PAGE_SIZE) {
count_t new_cnt;
/*
* The interval is contained in the rightmost
* interval of the leaf but its removal
* requires both updating the size of the
* original interval and also inserting a new
* interval.
*/
new_cnt = ((left_pg + left_cnt * PAGE_SIZE) -
(page + count * PAGE_SIZE)) >> PAGE_WIDTH;
leaf->value[leaf->keys - 1] -= count + new_cnt;
btree_insert(&a->used_space, page +
count * PAGE_SIZE, (void *) new_cnt, leaf);
return 1;
}
}
return 0;
}
/*
* The border cases have been already resolved.
* Now the interval can be only between intervals of the leaf.
*/
for (i = 1; i < leaf->keys - 1; i++) {
if (page < leaf->key[i]) {
uintptr_t left_pg = leaf->key[i - 1];
count_t left_cnt = (count_t) leaf->value[i - 1];
 
/*
* Now the interval is between intervals corresponding
* to (i - 1) and i.
*/
if (overlaps(left_pg, left_cnt * PAGE_SIZE, page,
count * PAGE_SIZE)) {
if (page + count * PAGE_SIZE ==
left_pg + left_cnt*PAGE_SIZE) {
/*
* The interval is contained in the
* interval (i - 1) of the leaf and can
* be removed by updating the size of
* the bigger interval.
*/
leaf->value[i - 1] -= count;
return 1;
} else if (page + count * PAGE_SIZE <
left_pg + left_cnt * PAGE_SIZE) {
count_t new_cnt;
/*
* The interval is contained in the
* interval (i - 1) of the leaf but its
* removal requires both updating the
* size of the original interval and
* also inserting a new interval.
*/
new_cnt = ((left_pg +
left_cnt * PAGE_SIZE) -
(page + count * PAGE_SIZE)) >>
PAGE_WIDTH;
leaf->value[i - 1] -= count + new_cnt;
btree_insert(&a->used_space, page +
count * PAGE_SIZE, (void *) new_cnt,
leaf);
return 1;
}
}
return 0;
}
}
 
error:
panic("Inconsistency detected while removing %d pages of used space "
"from %p.\n", count, page);
}
 
/** Remove reference to address space area share info.
*
* If the reference count drops to 0, the sh_info is deallocated.
*
* @param sh_info Pointer to address space area share info.
*/
void sh_info_remove_reference(share_info_t *sh_info)
{
bool dealloc = false;
 
mutex_lock(&sh_info->lock);
ASSERT(sh_info->refcount);
if (--sh_info->refcount == 0) {
dealloc = true;
link_t *cur;
/*
* Now walk carefully the pagemap B+tree and free/remove
* reference from all frames found there.
*/
for (cur = sh_info->pagemap.leaf_head.next;
cur != &sh_info->pagemap.leaf_head; cur = cur->next) {
btree_node_t *node;
unsigned int i;
node = list_get_instance(cur, btree_node_t, leaf_link);
for (i = 0; i < node->keys; i++)
frame_free((uintptr_t) node->value[i]);
}
}
mutex_unlock(&sh_info->lock);
if (dealloc) {
btree_destroy(&sh_info->pagemap);
free(sh_info);
}
}
 
/*
* Address space related syscalls.
*/
 
/** Wrapper for as_area_create(). */
unative_t sys_as_area_create(uintptr_t address, size_t size, int flags)
{
if (as_area_create(AS, flags | AS_AREA_CACHEABLE, size, address,
AS_AREA_ATTR_NONE, &anon_backend, NULL))
return (unative_t) address;
else
return (unative_t) -1;
}
 
/** Wrapper for as_area_resize(). */
unative_t sys_as_area_resize(uintptr_t address, size_t size, int flags)
{
return (unative_t) as_area_resize(AS, address, size, 0);
}
 
/** Wrapper for as_area_destroy(). */
unative_t sys_as_area_destroy(uintptr_t address)
{
return (unative_t) as_area_destroy(AS, address);
}
 
/** Print out information about address space.
*
* @param as Address space.
*/
void as_print(as_t *as)
{
ipl_t ipl;
ipl = interrupts_disable();
mutex_lock(&as->lock);
/* print out info about address space areas */
link_t *cur;
for (cur = as->as_area_btree.leaf_head.next;
cur != &as->as_area_btree.leaf_head; cur = cur->next) {
btree_node_t *node;
node = list_get_instance(cur, btree_node_t, leaf_link);
unsigned int i;
for (i = 0; i < node->keys; i++) {
as_area_t *area = node->value[i];
mutex_lock(&area->lock);
printf("as_area: %p, base=%p, pages=%d (%p - %p)\n",
area, area->base, area->pages, area->base,
area->base + area->pages*PAGE_SIZE);
mutex_unlock(&area->lock);
}
}
mutex_unlock(&as->lock);
interrupts_restore(ipl);
}
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/frame.c
0,0 → 1,1232
/*
* Copyright (c) 2001-2005 Jakub Jermar
* Copyright (c) 2005 Sergey Bondari
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Physical frame allocator.
*
* This file contains the physical frame allocator and memory zone management.
* The frame allocator is built on top of the buddy allocator.
*
* @see buddy.c
*/
 
/*
* Locking order
*
 * In order to access a particular zone, one must first lock
 * zones.lock, then lock the zone itself and then unlock zones.lock.
 * This ensures that the zones can be modified at run time
 * without affecting running processes.
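 *
 * A minimal sketch of the locking pattern (mirroring find_zone_and_lock()
 * below in this file):
 *
 *	spinlock_lock(&zones.lock);
 *	spinlock_lock(&zone->lock);
 *	spinlock_unlock(&zones.lock);
 *	... operate on the locked zone ...
 *	spinlock_unlock(&zone->lock);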
*
*/
 
#include <arch/types.h>
#include <mm/frame.h>
#include <mm/as.h>
#include <panic.h>
#include <debug.h>
#include <adt/list.h>
#include <synch/spinlock.h>
#include <arch/asm.h>
#include <arch.h>
#include <print.h>
#include <align.h>
#include <mm/slab.h>
#include <bitops.h>
#include <macros.h>
#include <config.h>
 
typedef struct {
count_t refcount; /**< tracking of shared frames */
uint8_t buddy_order; /**< buddy system block order */
link_t buddy_link; /**< link to the next free block inside one
order */
void *parent; /**< If allocated by slab, this points there */
} frame_t;
 
typedef struct {
SPINLOCK_DECLARE(lock); /**< this lock protects everything below */
pfn_t base; /**< frame_no of the first frame in the frames
array */
count_t count; /**< Size of zone */
 
frame_t *frames; /**< array of frame_t structures in this
zone */
count_t free_count; /**< number of free frame_t structures */
count_t busy_count; /**< number of busy frame_t structures */
buddy_system_t *buddy_system; /**< buddy system for the zone */
int flags;
} zone_t;
 
/*
 * The zones.lock must be held when accessing the zones structure.
* Some of the attributes in zone_t structures are 'read-only'
*/
 
typedef struct {
SPINLOCK_DECLARE(lock);
unsigned int count;
zone_t *info[ZONES_MAX];
} zones_t;
 
static zones_t zones;
 
 
/********************/
/* Helper functions */
/********************/
 
static inline index_t frame_index(zone_t *zone, frame_t *frame)
{
return (index_t) (frame - zone->frames);
}
 
static inline index_t frame_index_abs(zone_t *zone, frame_t *frame)
{
return (index_t) (frame - zone->frames) + zone->base;
}
 
static inline int frame_index_valid(zone_t *zone, index_t index)
{
return (index < zone->count);
}
 
/** Compute frame index from frame_t pointer & zone pointer */
static index_t make_frame_index(zone_t *zone, frame_t *frame)
{
return (frame - zone->frames);
}
 
/** Initialize frame structure
*
* Initialize frame structure.
*
* @param frame Frame structure to be initialized.
*/
static void frame_initialize(frame_t *frame)
{
frame->refcount = 1;
frame->buddy_order = 0;
}
 
/**********************/
/* Zoneinfo functions */
/**********************/
 
/**
* Insert-sort zone into zones list
*
* @param newzone New zone to be inserted into zone list
* @return zone number on success, -1 on error
*/
static int zones_add_zone(zone_t *newzone)
{
unsigned int i, j;
ipl_t ipl;
zone_t *z;
 
ipl = interrupts_disable();
spinlock_lock(&zones.lock);
	/* Check that the maximum zone count is not exceeded */
if (zones.count + 1 == ZONES_MAX) {
printf("Maximum zone count %u exceeded!\n", ZONES_MAX);
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
return -1;
}
for (i = 0; i < zones.count; i++) {
		/* Check for overlap */
z = zones.info[i];
if (overlaps(newzone->base, newzone->count, z->base, z->count)) {
printf("Zones overlap!\n");
return -1;
}
if (newzone->base < z->base)
break;
}
/* Move other zones up */
for (j = i; j < zones.count; j++)
zones.info[j + 1] = zones.info[j];
zones.info[i] = newzone;
zones.count++;
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
 
return i;
}
 
/**
 * Try to find a zone containing the given frame
*
* Assume interrupts are disabled.
*
* @param frame Frame number contained in zone
* @param pzone If not null, it is used as zone hint. Zone index
* is filled into the variable on success.
 * @return Pointer to locked zone containing the frame or NULL if not found
*/
static zone_t * find_zone_and_lock(pfn_t frame, unsigned int *pzone)
{
unsigned int i;
unsigned int hint = pzone ? *pzone : 0;
zone_t *z;
spinlock_lock(&zones.lock);
 
if (hint >= zones.count)
hint = 0;
i = hint;
do {
z = zones.info[i];
spinlock_lock(&z->lock);
if (z->base <= frame && z->base + z->count > frame) {
/* Unlock the global lock */
spinlock_unlock(&zones.lock);
if (pzone)
*pzone = i;
return z;
}
spinlock_unlock(&z->lock);
 
i++;
if (i >= zones.count)
i = 0;
} while(i != hint);
 
spinlock_unlock(&zones.lock);
return NULL;
}
 
/** @return True if zone can allocate specified order */
static int zone_can_alloc(zone_t *z, uint8_t order)
{
return buddy_system_can_alloc(z->buddy_system, order);
}
 
/** Find and lock zone that can allocate order frames.
*
* Assume interrupts are disabled.
*
* @param order Size (2^order) of free space we are trying to find
* @param pzone Pointer to preferred zone or NULL, on return contains zone
* number
*/
static zone_t * find_free_zone_and_lock(uint8_t order, unsigned int *pzone)
{
unsigned int i;
zone_t *z;
unsigned int hint = pzone ? *pzone : 0;
spinlock_lock(&zones.lock);
if (hint >= zones.count)
hint = 0;
i = hint;
do {
z = zones.info[i];
spinlock_lock(&z->lock);
 
/* Check if the zone has 2^order frames area available */
if (zone_can_alloc(z, order)) {
spinlock_unlock(&zones.lock);
if (pzone)
*pzone = i;
return z;
}
spinlock_unlock(&z->lock);
if (++i >= zones.count)
i = 0;
} while(i != hint);
spinlock_unlock(&zones.lock);
return NULL;
}
 
/**************************/
/* Buddy system functions */
/**************************/
 
/** Buddy system find_block implementation
*
 * Find the block that is the parent of the current list.
 * That means going to lower addresses until such a block is found.
 *
 * @param order The parent block must have an order different from this parameter.
*/
static link_t *zone_buddy_find_block(buddy_system_t *b, link_t *child,
uint8_t order)
{
frame_t *frame;
zone_t *zone;
index_t index;
frame = list_get_instance(child, frame_t, buddy_link);
zone = (zone_t *) b->data;
 
index = frame_index(zone, frame);
do {
if (zone->frames[index].buddy_order != order) {
return &zone->frames[index].buddy_link;
}
} while(index-- > 0);
return NULL;
}
 
static void zone_buddy_print_id(buddy_system_t *b, link_t *block)
{
frame_t *frame;
zone_t *zone;
index_t index;
 
frame = list_get_instance(block, frame_t, buddy_link);
zone = (zone_t *) b->data;
index = frame_index(zone, frame);
printf("%zd", index);
}
 
/** Buddy system find_buddy implementation
*
* @param b Buddy system.
* @param block Block for which buddy should be found
*
* @return Buddy for given block if found
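 *
 * The buddy of a block of order n lies (1 << n) frames away: at a higher
 * index if the block is the left buddy, at a lower index if it is the right
 * buddy. It is returned only if it has the same order and is free.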
*/
static link_t *zone_buddy_find_buddy(buddy_system_t *b, link_t *block)
{
frame_t *frame;
zone_t *zone;
index_t index;
bool is_left, is_right;
 
frame = list_get_instance(block, frame_t, buddy_link);
zone = (zone_t *) b->data;
ASSERT(IS_BUDDY_ORDER_OK(frame_index_abs(zone, frame),
frame->buddy_order));
is_left = IS_BUDDY_LEFT_BLOCK_ABS(zone, frame);
is_right = IS_BUDDY_RIGHT_BLOCK_ABS(zone, frame);
 
ASSERT(is_left ^ is_right);
if (is_left) {
index = (frame_index(zone, frame)) + (1 << frame->buddy_order);
} else { /* if (is_right) */
index = (frame_index(zone, frame)) - (1 << frame->buddy_order);
}
if (frame_index_valid(zone, index)) {
if (zone->frames[index].buddy_order == frame->buddy_order &&
zone->frames[index].refcount == 0) {
return &zone->frames[index].buddy_link;
}
}
 
return NULL;
}
 
/** Buddy system bisect implementation
*
* @param b Buddy system.
* @param block Block to bisect
*
* @return right block
*/
static link_t * zone_buddy_bisect(buddy_system_t *b, link_t *block) {
frame_t *frame_l, *frame_r;
 
frame_l = list_get_instance(block, frame_t, buddy_link);
frame_r = (frame_l + (1 << (frame_l->buddy_order - 1)));
return &frame_r->buddy_link;
}
 
/** Buddy system coalesce implementation
*
* @param b Buddy system.
* @param block_1 First block
* @param block_2 First block's buddy
*
* @return Coalesced block (actually block that represents lower address)
*/
static link_t *zone_buddy_coalesce(buddy_system_t *b, link_t *block_1,
link_t *block_2)
{
frame_t *frame1, *frame2;
frame1 = list_get_instance(block_1, frame_t, buddy_link);
frame2 = list_get_instance(block_2, frame_t, buddy_link);
return frame1 < frame2 ? block_1 : block_2;
}
 
/** Buddy system set_order implementation
*
* @param b Buddy system.
* @param block Buddy system block
* @param order Order to set
*/
static void zone_buddy_set_order(buddy_system_t *b, link_t *block,
uint8_t order) {
frame_t *frame;
frame = list_get_instance(block, frame_t, buddy_link);
frame->buddy_order = order;
}
 
/** Buddy system get_order implementation
*
* @param b Buddy system.
* @param block Buddy system block
*
* @return Order of block
*/
static uint8_t zone_buddy_get_order(buddy_system_t *b, link_t *block) {
frame_t *frame;
frame = list_get_instance(block, frame_t, buddy_link);
return frame->buddy_order;
}
 
/** Buddy system mark_busy implementation
*
* @param b Buddy system
* @param block Buddy system block
*
*/
static void zone_buddy_mark_busy(buddy_system_t *b, link_t * block) {
frame_t * frame;
 
frame = list_get_instance(block, frame_t, buddy_link);
frame->refcount = 1;
}
 
/** Buddy system mark_available implementation
*
* @param b Buddy system
* @param block Buddy system block
*
*/
static void zone_buddy_mark_available(buddy_system_t *b, link_t *block) {
frame_t *frame;
frame = list_get_instance(block, frame_t, buddy_link);
frame->refcount = 0;
}
 
static buddy_system_operations_t zone_buddy_system_operations = {
.find_buddy = zone_buddy_find_buddy,
.bisect = zone_buddy_bisect,
.coalesce = zone_buddy_coalesce,
.set_order = zone_buddy_set_order,
.get_order = zone_buddy_get_order,
.mark_busy = zone_buddy_mark_busy,
.mark_available = zone_buddy_mark_available,
.find_block = zone_buddy_find_block,
.print_id = zone_buddy_print_id
};
 
/******************/
/* Zone functions */
/******************/
 
/** Allocate frame in particular zone
*
* Assume zone is locked
* Panics if allocation is impossible.
*
* @param zone Zone to allocate from.
* @param order Allocate exactly 2^order frames.
*
* @return Frame index in zone
*
*/
static pfn_t zone_frame_alloc(zone_t *zone, uint8_t order)
{
pfn_t v;
link_t *tmp;
frame_t *frame;
 
/* Allocate frames from zone buddy system */
tmp = buddy_system_alloc(zone->buddy_system, order);
ASSERT(tmp);
/* Update zone information. */
zone->free_count -= (1 << order);
zone->busy_count += (1 << order);
 
	/* Frame will actually be the first frame of the block. */
frame = list_get_instance(tmp, frame_t, buddy_link);
/* get frame address */
v = make_frame_index(zone, frame);
return v;
}
 
/** Free frame from zone
*
* Assume zone is locked
*
* @param zone Pointer to zone from which the frame is to be freed
* @param frame_idx Frame index relative to zone
*/
static void zone_frame_free(zone_t *zone, index_t frame_idx)
{
frame_t *frame;
uint8_t order;
 
frame = &zone->frames[frame_idx];
/* remember frame order */
order = frame->buddy_order;
 
ASSERT(frame->refcount);
 
if (!--frame->refcount) {
buddy_system_free(zone->buddy_system, &frame->buddy_link);
/* Update zone information. */
zone->free_count += (1 << order);
zone->busy_count -= (1 << order);
}
}
 
/** Return frame from zone */
static frame_t * zone_get_frame(zone_t *zone, index_t frame_idx)
{
ASSERT(frame_idx < zone->count);
return &zone->frames[frame_idx];
}
 
/** Mark frame in zone unavailable to allocation */
static void zone_mark_unavailable(zone_t *zone, index_t frame_idx)
{
frame_t *frame;
link_t *link;
 
frame = zone_get_frame(zone, frame_idx);
if (frame->refcount)
return;
link = buddy_system_alloc_block(zone->buddy_system,
&frame->buddy_link);
ASSERT(link);
zone->free_count--;
}
 
/**
* Join 2 zones
*
 * Expect zone_t *z to point to a space at least zone_conf_size() bytes large
*
* Assume z1 & z2 are locked
*
* @param z Target zone structure pointer
* @param z1 Zone to merge
* @param z2 Zone to merge
*/
static void _zone_merge(zone_t *z, zone_t *z1, zone_t *z2)
{
uint8_t max_order;
unsigned int i;
int z2idx;
pfn_t frame_idx;
frame_t *frame;
 
ASSERT(!overlaps(z1->base, z1->count, z2->base, z2->count));
ASSERT(z1->base < z2->base);
 
spinlock_initialize(&z->lock, "zone_lock");
z->base = z1->base;
z->count = z2->base + z2->count - z1->base;
z->flags = z1->flags & z2->flags;
 
z->free_count = z1->free_count + z2->free_count;
z->busy_count = z1->busy_count + z2->busy_count;
max_order = fnzb(z->count);
 
z->buddy_system = (buddy_system_t *) &z[1];
buddy_system_create(z->buddy_system, max_order,
&zone_buddy_system_operations, (void *) z);
 
z->frames = (frame_t *)((uint8_t *) z->buddy_system +
buddy_conf_size(max_order));
for (i = 0; i < z->count; i++) {
/* This marks all frames busy */
frame_initialize(&z->frames[i]);
}
	/* Copy frames from both zones to preserve full frame orders,
	 * parents etc. Set the refcount of all free frames (refcount == 0)
	 * to 1 and clear their order to 0, because all free frames are
	 * added back to the buddy allocator later. Busy frames are left
	 * untouched, as they will not be reallocated during the merge and
	 * changing them would cause problems with later allocation/free.
*/
for (i = 0; i < z1->count; i++)
z->frames[i] = z1->frames[i];
for (i = 0; i < z2->count; i++) {
z2idx = i + (z2->base - z1->base);
z->frames[z2idx] = z2->frames[i];
}
i = 0;
while (i < z->count) {
if (z->frames[i].refcount) {
/* skip busy frames */
i += 1 << z->frames[i].buddy_order;
} else { /* Free frames, set refcount=1 */
/* All free frames have refcount=0, we need not
* to check the order */
z->frames[i].refcount = 1;
z->frames[i].buddy_order = 0;
i++;
}
}
/* Add free blocks from the 2 original zones */
while (zone_can_alloc(z1, 0)) {
frame_idx = zone_frame_alloc(z1, 0);
frame = &z->frames[frame_idx];
frame->refcount = 0;
buddy_system_free(z->buddy_system, &frame->buddy_link);
}
while (zone_can_alloc(z2, 0)) {
frame_idx = zone_frame_alloc(z2, 0);
frame = &z->frames[frame_idx + (z2->base - z1->base)];
frame->refcount = 0;
buddy_system_free(z->buddy_system, &frame->buddy_link);
}
}
 
/** Return old configuration frames into the zone
*
 * We have several cases:
 * - the configuration data is outside the zone -> exit
 *   (should we call frame_free()?)
 * - the configuration data was created by zone_create() or
 *   updated with zone_reduce_region() -> free every frame
*
* @param newzone The actual zone where freeing should occur
* @param oldzone Pointer to old zone configuration data that should
* be freed from new zone
*/
static void return_config_frames(zone_t *newzone, zone_t *oldzone)
{
pfn_t pfn;
frame_t *frame;
count_t cframes;
unsigned int i;
 
pfn = ADDR2PFN((uintptr_t)KA2PA(oldzone));
cframes = SIZE2FRAMES(zone_conf_size(oldzone->count));
if (pfn < newzone->base || pfn >= newzone->base + newzone->count)
return;
 
frame = &newzone->frames[pfn - newzone->base];
ASSERT(!frame->buddy_order);
 
for (i = 0; i < cframes; i++) {
newzone->busy_count++;
zone_frame_free(newzone, pfn+i-newzone->base);
}
}
 
/** Reduce allocated block to count of order 0 frames
*
 * The allocated block needs 2^order frames of space. Reduce all frames
 * in the block to order 0 and free the unneeded frames. This means that
* when freeing the previously allocated block starting with frame_idx,
* you have to free every frame.
*
* @param zone
* @param frame_idx Index to block
* @param count Allocated space in block
*/
static void zone_reduce_region(zone_t *zone, pfn_t frame_idx, count_t count)
{
count_t i;
uint8_t order;
frame_t *frame;
ASSERT(frame_idx + count < zone->count);
 
order = zone->frames[frame_idx].buddy_order;
ASSERT((count_t) (1 << order) >= count);
 
/* Reduce all blocks to order 0 */
for (i = 0; i < (count_t) (1 << order); i++) {
frame = &zone->frames[i + frame_idx];
frame->buddy_order = 0;
if (!frame->refcount)
frame->refcount = 1;
ASSERT(frame->refcount == 1);
}
/* Free unneeded frames */
for (i = count; i < (count_t) (1 << order); i++) {
zone_frame_free(zone, i + frame_idx);
}
}
 
/** Merge zones z1 and z2
*
* - the zones must be 2 zones with no zone existing in between,
* which means that z2 = z1+1
*
 * - When a new zone is created, the frame allocator configuration does
 *   not have to be 2^order in size. Once the allocator is running, this
 *   is no longer possible, because the merged configuration data occupies more space :-/
*/
void zone_merge(unsigned int z1, unsigned int z2)
{
ipl_t ipl;
zone_t *zone1, *zone2, *newzone;
unsigned int cframes;
uint8_t order;
unsigned int i;
pfn_t pfn;
 
ipl = interrupts_disable();
spinlock_lock(&zones.lock);
 
if ((z1 >= zones.count) || (z2 >= zones.count))
goto errout;
	/* We can join only 2 zones with none existing in between */
if (z2-z1 != 1)
goto errout;
 
zone1 = zones.info[z1];
zone2 = zones.info[z2];
spinlock_lock(&zone1->lock);
spinlock_lock(&zone2->lock);
 
cframes = SIZE2FRAMES(zone_conf_size(zone2->base + zone2->count -
zone1->base));
if (cframes == 1)
order = 0;
else
order = fnzb(cframes - 1) + 1;
 
/* Allocate zonedata inside one of the zones */
if (zone_can_alloc(zone1, order))
pfn = zone1->base + zone_frame_alloc(zone1, order);
else if (zone_can_alloc(zone2, order))
pfn = zone2->base + zone_frame_alloc(zone2, order);
else
goto errout2;
 
newzone = (zone_t *) PA2KA(PFN2ADDR(pfn));
 
_zone_merge(newzone, zone1, zone2);
 
/* Free unneeded config frames */
zone_reduce_region(newzone, pfn - newzone->base, cframes);
/* Subtract zone information from busy frames */
newzone->busy_count -= cframes;
 
/* Replace existing zones in zoneinfo list */
zones.info[z1] = newzone;
for (i = z2 + 1; i < zones.count; i++)
zones.info[i - 1] = zones.info[i];
zones.count--;
 
/* Free old zone information */
return_config_frames(newzone, zone1);
return_config_frames(newzone, zone2);
errout2:
	/* Nobody is allowed to enter the zone, so we are safe
	 * to touch the spinlocks one last time. */
spinlock_unlock(&zone1->lock);
spinlock_unlock(&zone2->lock);
errout:
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
}
 
/**
* Merge all zones into one big zone
*
 * It is reasonable to do this on systems whose BIOS reports memory in
 * several chunks, so that we end up with a single zone (which is faster).
*/
void zone_merge_all(void)
{
int count = zones.count;
 
while (zones.count > 1 && --count) {
zone_merge(0,1);
break;
}
}
 
/** Create frame zone
*
* Create new frame zone.
*
 * @param start Frame number of the first frame within the zone.
* @param count Count of frames in zone
* @param z Address of configuration information of zone
* @param flags Zone flags.
*
 * The zone structure pointed to by z is initialized in place.
*/
static void zone_construct(pfn_t start, count_t count, zone_t *z, int flags)
{
unsigned int i;
uint8_t max_order;
 
spinlock_initialize(&z->lock, "zone_lock");
z->base = start;
z->count = count;
z->flags = flags;
z->free_count = count;
z->busy_count = 0;
 
/*
* Compute order for buddy system, initialize
*/
max_order = fnzb(count);
z->buddy_system = (buddy_system_t *)&z[1];
buddy_system_create(z->buddy_system, max_order,
&zone_buddy_system_operations,
(void *) z);
/* Allocate frames _after_ the conframe */
/* Check sizes */
z->frames = (frame_t *)((uint8_t *) z->buddy_system +
buddy_conf_size(max_order));
for (i = 0; i < count; i++) {
frame_initialize(&z->frames[i]);
}
/* Stuffing frames */
for (i = 0; i < count; i++) {
z->frames[i].refcount = 0;
buddy_system_free(z->buddy_system, &z->frames[i].buddy_link);
}
}
 
/** Compute configuration data size for zone
*
* @param count Size of zone in frames
* @return Size of zone configuration info (in bytes)
*/
uintptr_t zone_conf_size(count_t count)
{
int size = sizeof(zone_t) + count*sizeof(frame_t);
int max_order;
 
max_order = fnzb(count);
size += buddy_conf_size(max_order);
return size;
}
 
/** Create and add zone to system
*
* @param start First frame number (absolute)
* @param count Size of zone in frames
* @param confframe Where configuration frames are supposed to be.
 * Automatically checks that we will not disturb the
* kernel and possibly init.
* If confframe is given _outside_ this zone, it is expected,
* that the area is already marked BUSY and big enough
* to contain zone_conf_size() amount of data.
* If the confframe is inside the area, the zone free frame
* information is modified not to include it.
*
* @return Zone number or -1 on error
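 *
 * Illustrative call (hypothetical base and size; zones are normally created
 * by architecture code during frame_arch_init()), placing the configuration
 * data at the start of the zone itself:
 * @code
 *	zone_create(ADDR2PFN(base), SIZE2FRAMES(size), ADDR2PFN(base), 0);
 * @endcode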
*/
int zone_create(pfn_t start, count_t count, pfn_t confframe, int flags)
{
zone_t *z;
uintptr_t addr;
count_t confcount;
unsigned int i;
int znum;
 
	/* Theoretically confframe could be 0; in practice make sure
	 * nobody tries to do that. If some platform requires it, remove
	 * the assert.
*/
ASSERT(confframe);
	/* If confframe is supposed to be inside our zone, then make sure
* it does not span kernel & init
*/
confcount = SIZE2FRAMES(zone_conf_size(count));
if (confframe >= start && confframe < start+count) {
for (;confframe < start + count; confframe++) {
addr = PFN2ADDR(confframe);
if (overlaps(addr, PFN2ADDR(confcount),
KA2PA(config.base), config.kernel_size))
continue;
if (overlaps(addr, PFN2ADDR(confcount),
KA2PA(config.stack_base), config.stack_size))
continue;
bool overlap = false;
count_t i;
for (i = 0; i < init.cnt; i++)
if (overlaps(addr, PFN2ADDR(confcount),
KA2PA(init.tasks[i].addr),
init.tasks[i].size)) {
overlap = true;
break;
}
if (overlap)
continue;
break;
}
if (confframe >= start + count)
panic("Cannot find configuration data for zone.");
}
 
z = (zone_t *) PA2KA(PFN2ADDR(confframe));
zone_construct(start, count, z, flags);
znum = zones_add_zone(z);
if (znum == -1)
return -1;
 
/* If confdata in zone, mark as unavailable */
if (confframe >= start && confframe < start + count)
for (i = confframe; i < confframe + confcount; i++) {
zone_mark_unavailable(z, i - z->base);
}
return znum;
}
 
/***************************************/
/* Frame functions */
/***************************************/
 
/** Set parent of frame */
void frame_set_parent(pfn_t pfn, void *data, unsigned int hint)
{
zone_t *zone = find_zone_and_lock(pfn, &hint);
 
ASSERT(zone);
 
zone_get_frame(zone, pfn-zone->base)->parent = data;
spinlock_unlock(&zone->lock);
}
 
void *frame_get_parent(pfn_t pfn, unsigned int hint)
{
zone_t *zone = find_zone_and_lock(pfn, &hint);
void *res;
 
ASSERT(zone);
res = zone_get_frame(zone, pfn - zone->base)->parent;
spinlock_unlock(&zone->lock);
return res;
}
 
/** Allocate power-of-two frames of physical memory.
*
* @param order Allocate exactly 2^order frames.
* @param flags Flags for host zone selection and address processing.
* @param pzone Preferred zone
*
* @return Physical address of the allocated frame.
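 *
 * Illustrative use (as in the ELF backend later in this revision), requesting
 * a single frame via the frame_alloc() convenience wrapper:
 * @code
 *	frame = (uintptr_t) frame_alloc(ONE_FRAME, 0);
 * @endcode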
*
*/
void * frame_alloc_generic(uint8_t order, int flags, unsigned int *pzone)
{
ipl_t ipl;
int freed;
pfn_t v;
zone_t *zone;
loop:
ipl = interrupts_disable();
/*
* First, find suitable frame zone.
*/
zone = find_free_zone_and_lock(order, pzone);
	/* If no memory is available, reclaim some slab memory;
	   if that does not help, reclaim all of it. */
if (!zone && !(flags & FRAME_NO_RECLAIM)) {
freed = slab_reclaim(0);
if (freed)
zone = find_free_zone_and_lock(order, pzone);
if (!zone) {
freed = slab_reclaim(SLAB_RECLAIM_ALL);
if (freed)
zone = find_free_zone_and_lock(order, pzone);
}
}
if (!zone) {
/*
* TODO: Sleep until frames are available again.
*/
interrupts_restore(ipl);
 
if (flags & FRAME_ATOMIC)
return 0;
panic("Sleep not implemented.\n");
goto loop;
}
v = zone_frame_alloc(zone, order);
v += zone->base;
 
spinlock_unlock(&zone->lock);
interrupts_restore(ipl);
 
if (flags & FRAME_KA)
return (void *)PA2KA(PFN2ADDR(v));
return (void *)PFN2ADDR(v);
}
 
/** Free a frame.
*
* Find respective frame structure for supplied physical frame address.
* Decrement frame reference count.
* If it drops to zero, move the frame structure to free list.
*
 * @param frame Physical address of the frame to be freed.
*/
void frame_free(uintptr_t frame)
{
ipl_t ipl;
zone_t *zone;
pfn_t pfn = ADDR2PFN(frame);
 
ipl = interrupts_disable();
/*
* First, find host frame zone for addr.
*/
zone = find_zone_and_lock(pfn,NULL);
ASSERT(zone);
zone_frame_free(zone, pfn-zone->base);
spinlock_unlock(&zone->lock);
interrupts_restore(ipl);
}
 
/** Add reference to frame.
*
* Find respective frame structure for supplied PFN and
* increment frame reference count.
*
 * @param pfn Frame number of the frame to be referenced.
*/
void frame_reference_add(pfn_t pfn)
{
ipl_t ipl;
zone_t *zone;
frame_t *frame;
 
ipl = interrupts_disable();
/*
* First, find host frame zone for addr.
*/
zone = find_zone_and_lock(pfn,NULL);
ASSERT(zone);
frame = &zone->frames[pfn-zone->base];
frame->refcount++;
spinlock_unlock(&zone->lock);
interrupts_restore(ipl);
}
 
/** Mark given range unavailable in frame zones */
void frame_mark_unavailable(pfn_t start, count_t count)
{
unsigned int i;
zone_t *zone;
unsigned int prefzone = 0;
for (i = 0; i < count; i++) {
zone = find_zone_and_lock(start + i, &prefzone);
if (!zone) /* PFN not found */
continue;
zone_mark_unavailable(zone, start + i - zone->base);
 
spinlock_unlock(&zone->lock);
}
}
 
/** Initialize physical memory management
*
 * Initialize physical memory management.
*/
void frame_init(void)
{
if (config.cpu_active == 1) {
zones.count = 0;
spinlock_initialize(&zones.lock, "zones.lock");
}
/* Tell the architecture to create some memory */
frame_arch_init();
if (config.cpu_active == 1) {
frame_mark_unavailable(ADDR2PFN(KA2PA(config.base)),
SIZE2FRAMES(config.kernel_size));
frame_mark_unavailable(ADDR2PFN(KA2PA(config.stack_base)),
SIZE2FRAMES(config.stack_size));
count_t i;
for (i = 0; i < init.cnt; i++) {
pfn_t pfn = ADDR2PFN(KA2PA(init.tasks[i].addr));
frame_mark_unavailable(pfn,
SIZE2FRAMES(init.tasks[i].size));
}
 
if (ballocs.size)
frame_mark_unavailable(ADDR2PFN(KA2PA(ballocs.base)),
SIZE2FRAMES(ballocs.size));
 
		/* Blacklist the first frame, as allocating NULL would
* fail in some places */
frame_mark_unavailable(0, 1);
}
}
 
 
/** Return total size of all zones
*
*/
uint64_t zone_total_size(void) {
zone_t *zone = NULL;
unsigned int i;
ipl_t ipl;
uint64_t total = 0;
 
ipl = interrupts_disable();
spinlock_lock(&zones.lock);
for (i = 0; i < zones.count; i++) {
zone = zones.info[i];
spinlock_lock(&zone->lock);
total += (uint64_t) FRAMES2SIZE(zone->count);
spinlock_unlock(&zone->lock);
}
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
return total;
}
 
 
 
/** Prints list of zones
*
*/
void zone_print_list(void) {
zone_t *zone = NULL;
unsigned int i;
ipl_t ipl;
 
ipl = interrupts_disable();
spinlock_lock(&zones.lock);
if (sizeof(void *) == 4) {
printf("# base address free frames busy frames\n");
printf("-- ------------ ------------ ------------\n");
} else {
printf("# base address free frames busy frames\n");
printf("-- -------------------- ------------ ------------\n");
}
for (i = 0; i < zones.count; i++) {
zone = zones.info[i];
spinlock_lock(&zone->lock);
if (sizeof(void *) == 4)
printf("%-2d %#10zx %12zd %12zd\n", i, PFN2ADDR(zone->base),
zone->free_count, zone->busy_count);
else
printf("%-2d %#18zx %12zd %12zd\n", i, PFN2ADDR(zone->base),
zone->free_count, zone->busy_count);
spinlock_unlock(&zone->lock);
}
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
}
 
/** Prints zone details.
*
* @param num Zone base address or zone number.
*/
void zone_print_one(unsigned int num) {
zone_t *zone = NULL;
ipl_t ipl;
unsigned int i;
 
ipl = interrupts_disable();
spinlock_lock(&zones.lock);
 
for (i = 0; i < zones.count; i++) {
if ((i == num) || (PFN2ADDR(zones.info[i]->base) == num)) {
zone = zones.info[i];
break;
}
}
if (!zone) {
printf("Zone not found.\n");
goto out;
}
spinlock_lock(&zone->lock);
printf("Memory zone information\n");
printf("Zone base address: %#.*p\n", sizeof(uintptr_t) * 2,
PFN2ADDR(zone->base));
printf("Zone size: %zd frames (%zd KB)\n", zone->count,
SIZE2KB(FRAMES2SIZE(zone->count)));
printf("Allocated space: %zd frames (%zd KB)\n", zone->busy_count,
SIZE2KB(FRAMES2SIZE(zone->busy_count)));
printf("Available space: %zd frames (%zd KB)\n", zone->free_count,
SIZE2KB(FRAMES2SIZE(zone->free_count)));
buddy_system_structure_print(zone->buddy_system, FRAME_SIZE);
spinlock_unlock(&zone->lock);
out:
spinlock_unlock(&zones.lock);
interrupts_restore(ipl);
}
 
/** @}
*/
 
/branches/tracing/kernel/generic/src/mm/backend_elf.c
0,0 → 1,342
/*
* Copyright (c) 2006 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Backend for address space areas backed by an ELF image.
*/
 
#include <lib/elf.h>
#include <debug.h>
#include <arch/types.h>
#include <mm/as.h>
#include <mm/frame.h>
#include <mm/slab.h>
#include <mm/page.h>
#include <genarch/mm/page_pt.h>
#include <genarch/mm/page_ht.h>
#include <align.h>
#include <memstr.h>
#include <macros.h>
#include <arch.h>
 
#ifdef CONFIG_VIRT_IDX_DCACHE
#include <arch/mm/cache.h>
#endif
 
static int elf_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access);
static void elf_frame_free(as_area_t *area, uintptr_t page, uintptr_t frame);
static void elf_share(as_area_t *area);
 
mem_backend_t elf_backend = {
.page_fault = elf_page_fault,
.frame_free = elf_frame_free,
.share = elf_share
};
 
/** Service a page fault in the ELF backend address space area.
*
* The address space area and page tables must be already locked.
*
* @param area Pointer to the address space area.
* @param addr Faulting virtual address.
* @param access Access mode that caused the fault (i.e. read/write/exec).
*
* @return AS_PF_FAULT on failure (i.e. page fault) or AS_PF_OK on success (i.e.
* serviced).
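 *
 * Depending on where the faulting page lies within the segment, the frame
 * is either backed directly by the ELF image (initialized portion of a
 * read-only segment), copied from the image (initialized portion of a
 * writable segment), freshly allocated and zeroed (the uninitialized
 * portion), or partially copied and partially zeroed (the mixed case at
 * the p_filesz boundary).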
*/
int elf_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access)
{
elf_header_t *elf = area->backend_data.elf;
elf_segment_header_t *entry = area->backend_data.segment;
btree_node_t *leaf;
uintptr_t base, frame;
index_t i;
bool dirty = false;
 
if (!as_area_check_access(area, access))
return AS_PF_FAULT;
 
ASSERT((addr >= entry->p_vaddr) &&
(addr < entry->p_vaddr + entry->p_memsz));
i = (addr - entry->p_vaddr) >> PAGE_WIDTH;
base = (uintptr_t) (((void *) elf) + entry->p_offset);
ASSERT(ALIGN_UP(base, FRAME_SIZE) == base);
 
if (area->sh_info) {
bool found = false;
 
/*
* The address space area is shared.
*/
mutex_lock(&area->sh_info->lock);
frame = (uintptr_t) btree_search(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base, &leaf);
if (!frame) {
unsigned int i;
 
/*
* Workaround for valid NULL address.
*/
 
for (i = 0; i < leaf->keys; i++) {
if (leaf->key[i] ==
ALIGN_DOWN(addr, PAGE_SIZE)) {
found = true;
break;
}
}
}
if (frame || found) {
frame_reference_add(ADDR2PFN(frame));
page_mapping_insert(AS, addr, frame,
as_area_get_flags(area));
if (!used_space_insert(area,
ALIGN_DOWN(addr, PAGE_SIZE), 1))
panic("Could not insert used space.\n");
mutex_unlock(&area->sh_info->lock);
return AS_PF_OK;
}
}
/*
* The area is either not shared or the pagemap does not contain the
* mapping.
*/
if (ALIGN_DOWN(addr, PAGE_SIZE) + PAGE_SIZE <
entry->p_vaddr + entry->p_filesz) {
/*
* Initialized portion of the segment. The memory is backed
* directly by the content of the ELF image. Pages are
* only copied if the segment is writable so that there
			 * can be more instantiations of the same ELF image memory
			 * used at a time. Note that this could later be done
* as COW.
*/
if (entry->p_flags & PF_W) {
frame = (uintptr_t)frame_alloc(ONE_FRAME, 0);
memcpy((void *) PA2KA(frame),
(void *) (base + i * FRAME_SIZE), FRAME_SIZE);
dirty = true;
 
if (area->sh_info) {
frame_reference_add(ADDR2PFN(frame));
btree_insert(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base,
(void *) frame, leaf);
}
 
} else {
frame = KA2PA(base + i*FRAME_SIZE);
}
} else if (ALIGN_DOWN(addr, PAGE_SIZE) >=
ALIGN_UP(entry->p_vaddr + entry->p_filesz, PAGE_SIZE)) {
/*
* This is the uninitialized portion of the segment.
* It is not physically present in the ELF image.
* To resolve the situation, a frame must be allocated
* and cleared.
*/
frame = (uintptr_t)frame_alloc(ONE_FRAME, 0);
memsetb(PA2KA(frame), FRAME_SIZE, 0);
dirty = true;
 
if (area->sh_info) {
frame_reference_add(ADDR2PFN(frame));
btree_insert(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base,
(void *) frame, leaf);
}
 
} else {
size_t size;
/*
* The mixed case.
* The lower part is backed by the ELF image and
* the upper part is anonymous memory.
*/
size = entry->p_filesz - (i<<PAGE_WIDTH);
frame = (uintptr_t)frame_alloc(ONE_FRAME, 0);
memsetb(PA2KA(frame) + size, FRAME_SIZE - size, 0);
memcpy((void *) PA2KA(frame), (void *) (base + i * FRAME_SIZE),
size);
dirty = true;
 
if (area->sh_info) {
frame_reference_add(ADDR2PFN(frame));
btree_insert(&area->sh_info->pagemap,
ALIGN_DOWN(addr, PAGE_SIZE) - area->base,
(void *) frame, leaf);
}
 
}
if (area->sh_info)
mutex_unlock(&area->sh_info->lock);
page_mapping_insert(AS, addr, frame, as_area_get_flags(area));
if (!used_space_insert(area, ALIGN_DOWN(addr, PAGE_SIZE), 1))
panic("Could not insert used space.\n");
 
return AS_PF_OK;
}
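/*
* Worked example (illustrative sketch, assuming 4 KiB pages and frames):
* for a segment with p_vaddr = 0x1000, p_filesz = 0x1800 and
* p_memsz = 0x3000, a fault at addr = 0x2234 yields i = 1 and
* ALIGN_DOWN(addr, PAGE_SIZE) = 0x2000. The page ends at 0x3000, past
* p_vaddr + p_filesz = 0x2800, but its start lies below
* ALIGN_UP(0x2800, PAGE_SIZE) = 0x3000, so the mixed case applies:
* size = 0x1800 - 0x1000 = 0x800 bytes are copied from the image and the
* remaining 0x800 bytes of the freshly allocated frame are zeroed.
*/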
 
/** Free a frame that is backed by the ELF backend.
*
* The address space area and page tables must be already locked.
*
* @param area Pointer to the address space area.
* @param page Page that is mapped to frame. Must be aligned to PAGE_SIZE.
* @param frame Frame to be released.
*
*/
void elf_frame_free(as_area_t *area, uintptr_t page, uintptr_t frame)
{
elf_header_t *elf = area->backend_data.elf;
elf_segment_header_t *entry = area->backend_data.segment;
uintptr_t base;
index_t i;
ASSERT((page >= entry->p_vaddr) &&
(page < entry->p_vaddr + entry->p_memsz));
i = (page - entry->p_vaddr) >> PAGE_WIDTH;
base = (uintptr_t) (((void *) elf) + entry->p_offset);
ASSERT(ALIGN_UP(base, FRAME_SIZE) == base);
if (page + PAGE_SIZE <
ALIGN_UP(entry->p_vaddr + entry->p_filesz, PAGE_SIZE)) {
if (entry->p_flags & PF_W) {
/*
* Free the frame with the copy of writable segment
* data.
*/
frame_free(frame);
}
} else {
/*
* The frame is either anonymous memory or the mixed case (i.e.
* lower part is backed by the ELF image and the upper is
* anonymous). In any case, a frame needs to be freed.
*/
frame_free(frame);
}
}
 
/** Share ELF image backed address space area.
*
* If the area is writable, then all mapped pages are duplicated in the pagemap.
* Otherwise only portions of the area that are not backed by the ELF image
* are put into the pagemap.
*
* The address space and address space area must be locked prior to the call.
*
* @param area Address space area.
*/
void elf_share(as_area_t *area)
{
elf_segment_header_t *entry = area->backend_data.segment;
link_t *cur;
btree_node_t *leaf, *node;
uintptr_t start_anon = entry->p_vaddr + entry->p_filesz;
 
/*
* Find the node in which to start linear search.
*/
if (area->flags & AS_AREA_WRITE) {
node = list_get_instance(area->used_space.leaf_head.next,
btree_node_t, leaf_link);
} else {
(void) btree_search(&area->sh_info->pagemap, start_anon, &leaf);
node = btree_leaf_node_left_neighbour(&area->sh_info->pagemap,
leaf);
if (!node)
node = leaf;
}
 
/*
* Copy used anonymous portions of the area to sh_info's page map.
*/
mutex_lock(&area->sh_info->lock);
for (cur = &node->leaf_link; cur != &area->used_space.leaf_head;
cur = cur->next) {
unsigned int i;
node = list_get_instance(cur, btree_node_t, leaf_link);
for (i = 0; i < node->keys; i++) {
uintptr_t base = node->key[i];
count_t count = (count_t) node->value[i];
unsigned int j;
/*
* Skip read-only areas of used space that are backed
* by the ELF image.
*/
if (!(area->flags & AS_AREA_WRITE))
if (base + count * PAGE_SIZE <= start_anon)
continue;
for (j = 0; j < count; j++) {
pte_t *pte;
/*
* Skip read-only pages that are backed by the
* ELF image.
*/
if (!(area->flags & AS_AREA_WRITE))
if (base + (j + 1) * PAGE_SIZE <=
start_anon)
continue;
page_table_lock(area->as, false);
pte = page_mapping_find(area->as,
base + j * PAGE_SIZE);
ASSERT(pte && PTE_VALID(pte) &&
PTE_PRESENT(pte));
btree_insert(&area->sh_info->pagemap,
(base + j * PAGE_SIZE) - area->base,
(void *) PTE_GET_FRAME(pte), NULL);
page_table_unlock(area->as, false);
 
pfn_t pfn = ADDR2PFN(PTE_GET_FRAME(pte));
frame_reference_add(pfn);
}
}
}
mutex_unlock(&area->sh_info->lock);
}
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/page.c
0,0 → 1,142
/*
* Copyright (c) 2001-2006 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Virtual Address Translation subsystem.
*
* This file contains code for creating, destroying and searching
* mappings between virtual addresses and physical addresses.
* Functions here are mere wrappers that call the real implementation.
* They do, however, define the single interface.
*/
 
#include <mm/page.h>
#include <arch/mm/page.h>
#include <arch/mm/asid.h>
#include <mm/as.h>
#include <mm/frame.h>
#include <arch/types.h>
#include <arch/asm.h>
#include <memstr.h>
#include <debug.h>
#include <arch.h>
 
/** Virtual operations for page subsystem. */
page_mapping_operations_t *page_mapping_operations = NULL;
 
void page_init(void)
{
page_arch_init();
}
 
/** Map memory structure
*
* Identity-map a memory structure,
* taking into account possible
* page boundary crossings.
*
* @param s Address of the structure.
* @param size Size of the structure.
*/
void map_structure(uintptr_t s, size_t size)
{
int i, cnt, length;
 
length = size + (s - (s & ~(PAGE_SIZE - 1)));
cnt = length / PAGE_SIZE + (length % PAGE_SIZE > 0);
 
for (i = 0; i < cnt; i++)
page_mapping_insert(AS_KERNEL, s + i * PAGE_SIZE,
    s + i * PAGE_SIZE, PAGE_NOT_CACHEABLE | PAGE_WRITE);
 
}
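/*
* Worked example (illustrative, assuming 4 KiB pages): mapping a structure
* at s = 0x80001ff0 with size = 0x20 gives
* length = 0x20 + (0x80001ff0 - 0x80001000) = 0x1010 and hence cnt = 2,
* so both pages straddled by the structure are identity-mapped.
*/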
 
/** Insert mapping of page to frame.
*
* Map virtual address page to physical address frame
* using flags. Allocate and setup any missing page tables.
*
* The page table must be locked and interrupts must be disabled.
*
* @param as Address space to which the page belongs.
* @param page Virtual address of the page to be mapped.
* @param frame Physical address of memory frame to which the mapping is done.
* @param flags Flags to be used for mapping.
*/
void page_mapping_insert(as_t *as, uintptr_t page, uintptr_t frame, int flags)
{
ASSERT(page_mapping_operations);
ASSERT(page_mapping_operations->mapping_insert);
page_mapping_operations->mapping_insert(as, page, frame, flags);
}
 
/** Remove mapping of page.
*
* Remove any mapping of page within address space as.
* TLB shootdown should follow in order to make effects of
* this call visible.
*
* The page table must be locked and interrupts must be disabled.
*
* @param as Address space to which the page belongs.
* @param page Virtual address of the page to be demapped.
*/
void page_mapping_remove(as_t *as, uintptr_t page)
{
ASSERT(page_mapping_operations);
ASSERT(page_mapping_operations->mapping_remove);
page_mapping_operations->mapping_remove(as, page);
}
 
/** Find mapping for virtual page.
*
* Find the mapping of the given virtual page within address space as.
*
* The page table must be locked and interrupts must be disabled.
*
* @param as Address space to which the page belongs.
* @param page Virtual page.
*
* @return NULL if there is no such mapping; requested mapping otherwise.
*/
pte_t *page_mapping_find(as_t *as, uintptr_t page)
{
ASSERT(page_mapping_operations);
ASSERT(page_mapping_operations->mapping_find);
 
return page_mapping_operations->mapping_find(as, page);
}
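/*
* Illustrative sketch (not part of the original sources): how an
* architecture-specific backend could plug into the page_mapping_operations
* interface used by the wrappers above. The names pt_operations and
* pt_mapping_*() are hypothetical placeholders; only the
* mapping_insert/mapping_remove/mapping_find members and page_arch_init()
* come from this subsystem.
*/
#if 0
static void pt_mapping_insert(as_t *as, uintptr_t page, uintptr_t frame,
    int flags);
static void pt_mapping_remove(as_t *as, uintptr_t page);
static pte_t *pt_mapping_find(as_t *as, uintptr_t page);

static page_mapping_operations_t pt_operations = {
	.mapping_insert = pt_mapping_insert,
	.mapping_remove = pt_mapping_remove,
	.mapping_find = pt_mapping_find
};

void page_arch_init(void)
{
	/* Make the generic wrappers above dispatch to this implementation. */
	page_mapping_operations = &pt_operations;
}
#endif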
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/backend_phys.c
0,0 → 1,97
/*
* Copyright (c) 2006 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Backend for address space areas backed by continuous physical
* memory.
*/
 
#include <debug.h>
#include <arch/types.h>
#include <mm/as.h>
#include <mm/frame.h>
#include <mm/slab.h>
#include <memstr.h>
#include <macros.h>
#include <arch.h>
#include <align.h>
 
static int phys_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access);
static void phys_share(as_area_t *area);
 
mem_backend_t phys_backend = {
.page_fault = phys_page_fault,
.frame_free = NULL,
.share = phys_share
};
 
/** Service a page fault in the address space area backed by physical memory.
*
* The address space area and page tables must be already locked.
*
* @param area Pointer to the address space area.
* @param addr Faulting virtual address.
* @param access Access mode that caused the fault (i.e. read/write/exec).
*
* @return AS_PF_FAULT on failure (i.e. page fault) or AS_PF_OK on success (i.e.
* serviced).
*/
int phys_page_fault(as_area_t *area, uintptr_t addr, pf_access_t access)
{
uintptr_t base = area->backend_data.base;
 
if (!as_area_check_access(area, access))
return AS_PF_FAULT;
 
ASSERT(addr - area->base < area->backend_data.frames * FRAME_SIZE);
page_mapping_insert(AS, addr, base + (addr - area->base),
as_area_get_flags(area));
if (!used_space_insert(area, ALIGN_DOWN(addr, PAGE_SIZE), 1))
panic("Could not insert used space.\n");
 
return AS_PF_OK;
}
 
/** Share address space area backed by physical memory.
*
* Do nothing, as sharing address space areas backed by physical
* memory requires no extra work.
* Note that the function must be defined so that
* as_area_share() will succeed.
*
* @param area Address space area.
*/
void phys_share(as_area_t *area)
{
}
 
/** @}
*/
/branches/tracing/kernel/generic/src/mm/buddy.c
0,0 → 1,323
/*
* Copyright (c) 2005 Jakub Jermar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
/** @addtogroup genericmm
* @{
*/
 
/**
* @file
* @brief Buddy allocator framework.
*
* This file contains buddy system allocator framework.
* Specialized functions are needed for this abstract framework
* to be useful.
*/
 
#include <mm/buddy.h>
#include <mm/frame.h>
#include <arch/types.h>
#include <debug.h>
#include <print.h>
 
/** Return size needed for the buddy configuration data */
size_t buddy_conf_size(int max_order)
{
return sizeof(buddy_system_t) + (max_order + 1) * sizeof(link_t);
}
 
 
/** Create buddy system
*
* Initialize a preallocated buddy system control structure.
*
* @param b Preallocated buddy system control data.
* @param max_order The biggest allocable size will be 2^max_order.
* @param op Operations for new buddy system.
* @param data Pointer to be used by implementation.
*/
void buddy_system_create(buddy_system_t *b,
uint8_t max_order,
buddy_system_operations_t *op,
void *data)
{
int i;
 
ASSERT(max_order < BUDDY_SYSTEM_INNER_BLOCK);
 
ASSERT(op->find_buddy);
ASSERT(op->set_order);
ASSERT(op->get_order);
ASSERT(op->bisect);
ASSERT(op->coalesce);
ASSERT(op->mark_busy);
 
/*
* Use memory after our own structure
*/
b->order = (link_t *) (&b[1]);
for (i = 0; i <= max_order; i++)
list_initialize(&b->order[i]);
 
b->max_order = max_order;
b->op = op;
b->data = data;
}
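/*
* Illustrative sketch (not part of the original sources): setting up and
* exercising a buddy system. The caller supplies a configuration area of at
* least buddy_conf_size(max_order) bytes and an operations table;
* my_buddy_ops and example_buddy_setup() are hypothetical placeholders,
* only the buddy_system_*() and buddy_conf_size() calls come from this file.
*/
#if 0
extern buddy_system_operations_t my_buddy_ops;

void example_buddy_setup(void *conf_area)
{
	/* conf_area must be at least buddy_conf_size(max_order) bytes. */
	uint8_t max_order = 9;		/* largest block spans 2^9 elements */
	buddy_system_t *b = (buddy_system_t *) conf_area;

	buddy_system_create(b, max_order, &my_buddy_ops, NULL);

	/*
	 * Free lists are empty right after creation; blocks are typically
	 * seeded by the user via buddy_system_free().
	 */
	if (buddy_system_can_alloc(b, 3)) {
		link_t *block = buddy_system_alloc(b, 3);	/* a 2^3 block */
		buddy_system_free(b, block);
	}
}
#endif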
 
/** Check if buddy system can allocate block
*
* @param b Buddy system pointer
* @param i Size of the block (2^i)
*
* @return True if block can be allocated
*/
bool buddy_system_can_alloc(buddy_system_t *b, uint8_t i)
{
uint8_t k;
/*
* If the requested block is greater than the maximal block,
* we know immediately that we cannot satisfy the request.
*/
if (i > b->max_order) return false;
 
/*
* Check if any bigger or equal order has free elements
*/
for (k = i; k <= b->max_order; k++) {
if (!list_empty(&b->order[k])) {
return true;
}
}
return false;
}
 
/** Allocate a particular block from the buddy system.
*
* @param b Buddy system pointer.
* @param block Block to allocate.
*
* @return Block of data or NULL if no such block was found.
*/
link_t *buddy_system_alloc_block(buddy_system_t *b, link_t *block)
{
link_t *left, *right, *tmp;
uint8_t order;
 
left = b->op->find_block(b, block, BUDDY_SYSTEM_INNER_BLOCK);
ASSERT(left);
list_remove(left);
while (1) {
if (!b->op->get_order(b, left)) {
b->op->mark_busy(b, left);
return left;
}
order = b->op->get_order(b, left);
 
right = b->op->bisect(b, left);
b->op->set_order(b, left, order - 1);
b->op->set_order(b, right, order - 1);
 
tmp = b->op->find_block(b, block, BUDDY_SYSTEM_INNER_BLOCK);
 
if (tmp == right) {
right = left;
left = tmp;
}
ASSERT(tmp == left);
b->op->mark_busy(b, left);
buddy_system_free(b, right);
b->op->mark_available(b, left);
}
}
 
/** Allocate block from buddy system.
*
* @param b Buddy system pointer.
* @param i Returned block will be 2^i big.
*
* @return Block of data represented by link_t.
*/
link_t *buddy_system_alloc(buddy_system_t *b, uint8_t i)
{
link_t *res, *hlp;
 
ASSERT(i <= b->max_order);
 
/*
* If the list of order i is not empty,
* the request can be immediately satisfied.
*/
if (!list_empty(&b->order[i])) {
res = b->order[i].next;
list_remove(res);
b->op->mark_busy(b, res);
return res;
}
/*
* If order i is already the maximal order,
* the request cannot be satisfied.
*/
if (i == b->max_order)
return NULL;
 
/*
* Try to recursively satisfy the request from higher order lists.
*/
hlp = buddy_system_alloc(b, i + 1);
/*
* The request could not be satisfied
* from higher order lists.
*/
if (!hlp)
return NULL;
res = hlp;
/*
* Bisect the block and set order of both of its parts to i.
*/
hlp = b->op->bisect(b, res);
b->op->set_order(b, res, i);
b->op->set_order(b, hlp, i);
/*
* Return the other half to buddy system. Mark the first part
* busy so that it won't coalesce again.
*/
b->op->mark_busy(b, res);
buddy_system_free(b, hlp);
return res;
}
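/*
* Worked example (illustrative): a request for an order-1 block when only
* an order-3 block is free. buddy_system_alloc(b, 1) recurses to order 2
* and then to order 3, where the free block is taken. On the way back it is
* bisected into two order-2 halves (one goes back to the order-2 list); the
* remaining half is bisected again into two order-1 halves (one goes back
* to the order-1 list). The caller receives one order-1 block, leaving one
* order-1 and one order-2 block on the free lists.
*/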
 
/** Return block to buddy system.
*
* @param b Buddy system pointer.
* @param block Block to return.
*/
void buddy_system_free(buddy_system_t *b, link_t *block)
{
link_t *buddy, *hlp;
uint8_t i;
 
/*
* Determine block's order.
*/
i = b->op->get_order(b, block);
 
ASSERT(i <= b->max_order);
 
if (i != b->max_order) {
/*
* See if there is any buddy in the list of order i.
*/
buddy = b->op->find_buddy(b, block);
if (buddy) {
 
ASSERT(b->op->get_order(b, buddy) == i);
/*
* Remove buddy from the list of order i.
*/
list_remove(buddy);
/*
* Invalidate order of both block and buddy.
*/
b->op->set_order(b, block, BUDDY_SYSTEM_INNER_BLOCK);
b->op->set_order(b, buddy, BUDDY_SYSTEM_INNER_BLOCK);
/*
* Coalesce block and buddy into one block.
*/
hlp = b->op->coalesce(b, block, buddy);
 
/*
* Set order of the coalesced block to i + 1.
*/
b->op->set_order(b, hlp, i + 1);
 
/*
* Recursively free the coalesced block; it may coalesce
* further in the list of order i + 1.
*/
buddy_system_free(b, hlp);
return;
}
}
 
/*
* Insert block into the list of order i.
*/
list_append(block, &b->order[i]);
 
}
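/*
* Worked example (illustrative): freeing an order-0 block whose buddy is
* also free. The buddy is removed from the order-0 list, both orders are
* invalidated, and the coalesced order-1 block is freed recursively. If its
* order-1 buddy is free as well, coalescing continues to order 2; as soon
* as a buddy is busy (or max_order is reached), the block is simply
* appended to the free list of its current order.
*/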
 
/** Prints out structure of buddy system
*
* @param b Pointer to buddy system
* @param elem_size Element size
*/
void buddy_system_structure_print(buddy_system_t *b, size_t elem_size)
{
	index_t i;
	count_t cnt, elem_count = 0, block_count = 0;
	link_t *cur;

	printf("Order\tBlocks\tSize \tBlock size\tElems per block\n");
	printf("-----\t------\t--------\t----------\t---------------\n");
	for (i = 0; i <= b->max_order; i++) {
		cnt = 0;
		if (!list_empty(&b->order[i])) {
			for (cur = b->order[i].next; cur != &b->order[i];
			    cur = cur->next)
				cnt++;
		}

		printf("#%zd\t%5zd\t%7zdK\t%8zdK\t%6zd\t", i, cnt,
		    (cnt * (1 << i) * elem_size) >> 10,
		    ((1 << i) * elem_size) >> 10, 1 << i);
		if (!list_empty(&b->order[i])) {
			for (cur = b->order[i].next; cur != &b->order[i];
			    cur = cur->next) {
				b->op->print_id(b, cur);
				printf(" ");
			}
		}
		printf("\n");

		block_count += cnt;
		elem_count += (1 << i) * cnt;
	}
	printf("-----\t------\t--------\t----------\t---------------\n");
	printf("Buddy system contains %zd free elements (%zd blocks)\n",
	    elem_count, block_count);
}
 
/** @}
*/