18.1 KSM Implementation

KSM creates a kernel thread named "ksmd" during initialization.

[mm/ksm.c]

static int __init ksm_init(void)
{
    struct task_struct *ksm_thread;
    int err;

    err = ksm_slab_init();
    if (err)
        goto out;

    ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
    if (IS_ERR(ksm_thread)) {
        pr_err("ksm: creating kthread failed\n");
        err = PTR_ERR(ksm_thread);
        goto out_free;
    }

#ifdef CONFIG_SYSFS
    err = sysfs_create_group(mm_kobj, &ksm_attr_group);
    if (err) {
        pr_err("ksm: register sysfs failed\n");
        kthread_stop(ksm_thread);
        goto out_free;
    }
#else
    ksm_run = KSM_RUN_MERGE;    /* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
    /* There is no significance to this priority 100 */
    hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
    return 0;

out_free:
    ksm_slab_free();
out:
    return err;
}

KSM only processes user-process address space that has been explicitly registered through the madvise system call. A user program that wants to use this feature must therefore call madvise(addr, length, MADV_MERGEABLE) explicitly on the memory it allocates; likewise, to withdraw a region of a process's address space from KSM merging, it must explicitly call madvise(addr, length, MADV_UNMERGEABLE).
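
For reference, here is a minimal user-space sketch (not part of the original text) of how a program would register an anonymous mapping with KSM and later withdraw it; it assumes a kernel built with CONFIG_KSM:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;

    /* Private anonymous mapping: the only kind of memory KSM scans. */
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    memset(buf, 0x5a, len);             /* identical content in every page */

    /* Ask ksmd to consider this range for merging ... */
    if (madvise(buf, len, MADV_MERGEABLE) != 0)
        perror("madvise(MADV_MERGEABLE)");

    /* ... and this is how the range would later be withdrawn. */
    if (madvise(buf, len, MADV_UNMERGEABLE) != 0)
        perror("madvise(MADV_UNMERGEABLE)");

    munmap(buf, len);
    return 0;
}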

In Android, the mmap() implementation in the C library (Android's libc is bionic) already enables this feature by default.

[libc/bionic/mmap.cpp]

void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offset) {
  if (offset < 0 || (offset & ((1UL << MMAP2_SHIFT)-1)) != 0) {
    errno = EINVAL;
    return MAP_FAILED;
  }

  // prevent allocations large enough for `end - start` to overflow
  size_t rounded = BIONIC_ALIGN(size, PAGE_SIZE);
  if (rounded < size || rounded > PTRDIFF_MAX) {
    errno = ENOMEM;
    return MAP_FAILED;
  }

  bool is_private_anonymous =
      (flags & (MAP_PRIVATE | MAP_ANONYMOUS)) == (MAP_PRIVATE | MAP_ANONYMOUS);
  bool is_stack_or_grows_down = (flags & (MAP_STACK | MAP_GROWSDOWN)) != 0;

  void* result = __mmap2(addr, size, prot, flags, fd, offset >> MMAP2_SHIFT);

  /* Check whether the region returned by mmap is a private anonymous mapping
     (MAP_PRIVATE | MAP_ANONYMOUS); if so, explicitly call the madvise system
     call to register the user-space region with the kernel's KSM subsystem. */
  if (result != MAP_FAILED && kernel_has_MADV_MERGEABLE &&
      is_private_anonymous && !is_stack_or_grows_down) {
    ErrnoRestorer errno_restorer;
    int rc = madvise(result, size, MADV_MERGEABLE);
    if (rc == -1 && errno == EINVAL) {
      kernel_has_MADV_MERGEABLE = false;
    }
  }

  return result;
}

[madvise()->ksm_madvise()->__ksm_enter()]

int __ksm_enter(struct mm_struct *mm)
{
    struct mm_slot *mm_slot;
    int needs_wakeup;
    /* Allocate a struct mm_slot */
    mm_slot = alloc_mm_slot();
    if (!mm_slot)
        return -ENOMEM;

    /* Check ksm_run too?  Would need tighter locking */
    needs_wakeup = list_empty(&ksm_mm_head.mm_list);

    /* Take the spinlock that protects the KSM mm_list */
    spin_lock(&ksm_mmlist_lock);

    /* Add the current mm to the mm_slots_hash table (a global hash table that must be accessed under the lock) */
    insert_to_mm_slots_hash(mm, mm_slot);
    /*
     * When KSM_RUN_MERGE (or KSM_RUN_STOP),
     * insert just behind the scanning cursor, to let the area settle
     * down a little; when fork is followed by immediate exec, we don't
     * want ksmd to waste time setting up and tearing down an rmap_list.
     *
     * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
     * scanning cursor, otherwise KSM pages in newly forked mms will be
     * missed: then we might as well insert at the end of the list.
     */
    if (ksm_run & KSM_RUN_UNMERGE)
        list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
    else
        list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
    spin_unlock(&ksm_mmlist_lock);

    /* Set the MMF_VM_MERGEABLE bit in mm->flags to mark this process as registered with KSM */
    set_bit(MMF_VM_MERGEABLE, &mm->flags);
    atomic_inc(&mm->mm_count);

    /* If ksm_mm_head.mm_list was empty before, wake up the ksmd kernel thread */
    if (needs_wakeup)
        wake_up_interruptible(&ksm_thread_wait);

    return 0;
}

Implementation of ksm_scan_thread(): this function is the main body of the ksmd kernel thread. On each iteration it calls ksm_do_scan() to scan and try to merge 100 pages (see the ksm_thread_pages_to_scan variable) and then sleeps for 20 milliseconds (see the ksm_thread_sleep_millisecs variable). Both parameters can be read and changed through the corresponding files under /sys/kernel/mm/ksm.
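
The same knobs can be driven from user space through the sysfs attributes registered by ksm_attr_group in ksm_init(). A minimal sketch (not from the original text; it assumes the standard attribute names run, pages_to_scan and sleep_millisecs, and must run as root):

#include <stdio.h>

static int write_ksm_param(const char *name, const char *value)
{
    char path[128];
    FILE *fp;

    snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
    fp = fopen(path, "w");
    if (!fp) {
        perror(path);
        return -1;
    }
    fputs(value, fp);
    fclose(fp);
    return 0;
}

int main(void)
{
    write_ksm_param("pages_to_scan", "100");    /* ksm_thread_pages_to_scan */
    write_ksm_param("sleep_millisecs", "20");   /* ksm_thread_sleep_millisecs */
    write_ksm_param("run", "1");                /* KSM_RUN_MERGE: start ksmd scanning */
    return 0;
}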

[ksmd kernel thread]

static int ksm_scan_thread(void *nothing)
{
    set_freezable();
    set_user_nice(current, 5);

    while (!kthread_should_stop()) {
        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksmd_should_run())
            ksm_do_scan(ksm_thread_pages_to_scan);
        mutex_unlock(&ksm_thread_mutex);

        try_to_freeze();

        if (ksmd_should_run()) {
            schedule_timeout_interruptible(
                msecs_to_jiffies(ksm_thread_sleep_millisecs));
        } else {
            wait_event_freezable(ksm_thread_wait,
                ksmd_should_run() || kthread_should_stop());
        }
    }
    return 0;
}

Implementation of ksm_do_scan():

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
    struct rmap_item *rmap_item;
    struct page *uninitialized_var(page);
    /* The while loop tries to scan and merge up to scan_npages pages */
    while (scan_npages-- && likely(!freezing(current))) {
        cond_resched();
        /* scan_get_next_rmap_item() picks a suitable anonymous page; see its implementation below */
        rmap_item = scan_get_next_rmap_item(&page);
        if (!rmap_item)
            return;
        /* cmp_and_merge_page() searches the KSM stable and unstable red-black trees
           for a page that can be merged with page, and tries to merge them;
           see its implementation below */
        cmp_and_merge_page(page, rmap_item);
        put_page(page);
    }
}

KSM core data structures:

/* Describes one reverse-mapping entry (item) for a virtual address */
struct rmap_item {
    /* All rmap_items of an mm are chained into a list (head in mm_slot->rmap_list);
       ksm_scan.rmap_list is the scan cursor into it */
    struct rmap_item *rmap_list;
    union {
        /* When the rmap_item is in the stable tree, points to the VMA's anon_vma */
        struct anon_vma *anon_vma;  /* when stable */
#ifdef CONFIG_NUMA
        int nid;        /* when node of unstable tree */
#endif
    };
    /* The process's struct mm_struct */
    struct mm_struct *mm;
    /* The user-space address tracked by this rmap_item */
    unsigned long address;      /* + low bits used for flags below */
    /* The previous checksum of the page at this virtual address */
    unsigned int oldchecksum;   /* when unstable */
    union {
        /* Node linking the rmap_item into the unstable red-black tree */
        struct rb_node node;    /* when node of unstable tree */
        struct {        /* when listed from stable tree */
            /* The stable tree node this rmap_item hangs off */
            struct stable_node *head;
            /* Entry in the stable node's hlist */
            struct hlist_node hlist;
        };
    };
};

/* Describes the mm_struct of a process that has been registered with KSM for scanning */
struct mm_slot {
    /* Links the mm_slot into the mm_slots hash table */
    struct hlist_node link;
    /* Links the mm_slot into the mm_slot list, whose head is ksm_mm_head */
    struct list_head mm_list;
    /* Head of this mm's rmap_item list */
    struct rmap_item *rmap_list;
    /* The process's mm_struct */
    struct mm_struct *mm;
};

/* Describes the current scan state */
struct ksm_scan {
    /* The mm_slot currently being scanned */
    struct mm_slot *mm_slot;
    /* The address to scan next */
    unsigned long address;
    /* Pointer to the next rmap_item to be scanned */
    struct rmap_item **rmap_list;
    /* Incremented after each full scan; used to age out unstable tree nodes */
    unsigned long seqnr;
};

[mm/ksm.c]

ksm_mm_head is the head of the mm_slot list. ksm_scan is a static global data structure describing the mm_slot currently being scanned.

static struct mm_slot ksm_mm_head = {
    .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
    .mm_slot = &ksm_mm_head,
};

Implementation of scan_get_next_rmap_item(): find a qualifying anonymous page.

If such a page is found, allocate an rmap_item structure to describe it.

Let's look at the implementation of scan_get_next_rmap_item(), called from ksm_do_scan().

[ksm_do_scan()->scan_get_next_rmap_item()]

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
    struct mm_struct *mm;
    struct mm_slot *slot;
    struct vm_area_struct *vma;
    struct rmap_item *rmap_item;
    int nid;

    if (list_empty(&ksm_mm_head.mm_list))
        return NULL;

    /* The start of a new scan round for ksmd: initialize the ksm_scan members
       ksm_scan.mm_slot, ksm_scan.address and ksm_scan.rmap_list */
    slot = ksm_scan.mm_slot;
    if (slot == &ksm_mm_head) {
        /*
         * A number of pages can hang around indefinitely on per-cpu
         * pagevecs, raised page count preventing write_protect_page
         * from merging them.  Though it doesn't really matter much,
         * it is puzzling to see some stuck in pages_volatile until
         * other activity jostles them out, and they also prevented
         * LTP's KSM test from succeeding deterministically; so drain
         * them here (here rather than on entry to ksm_do_scan(),
         * so we don't IPI too often when pages_to_scan is set low).
         */
        lru_add_drain_all();

        /*
         * Whereas stale stable_nodes on the stable_tree itself
         * get pruned in the regular course of stable_tree_search(),
         * those moved out to the migrate_nodes list can accumulate:
         * so prune them once before each full scan.
         */
        if (!ksm_merge_across_nodes) {
            struct stable_node *stable_node;
            struct list_head *this, *next;
            struct page *page;

            list_for_each_safe(this, next, &migrate_nodes) {
                stable_node = list_entry(this,
                        struct stable_node, list);
                page = get_ksm_page(stable_node, false);
                if (page)
                    put_page(page);
                cond_resched();
            }
        }

        for (nid = 0; nid < ksm_nr_node_ids; nid++)
            root_unstable_tree[nid] = RB_ROOT;

        spin_lock(&ksm_mmlist_lock);
        slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
        ksm_scan.mm_slot = slot;
        spin_unlock(&ksm_mmlist_lock);
        /*
         * Although we tested list_empty() above, a racing __ksm_exit
         * of the last mm on the list may have removed it since then.
         */
        if (slot == &ksm_mm_head)
            return NULL;
next_mm:
        ksm_scan.address = 0;
        ksm_scan.rmap_list = &slot->rmap_list;
    }


    /* Scan all the VMAs of the process behind the current slot, looking for a suitable anonymous page */
    mm = slot->mm;
    down_read(&mm->mmap_sem);
    if (ksm_test_exit(mm))
        vma = NULL;
    else
        vma = find_vma(mm, ksm_scan.address); /* address starts at 0 on a new round, so this finds the process's first VMA */

    /* The for loop walks all the VMAs */
    for (; vma; vma = vma->vm_next) {
        if (!(vma->vm_flags & VM_MERGEABLE))
            continue;
        if (ksm_scan.address < vma->vm_start)
            ksm_scan.address = vma->vm_start; /* record this VMA's start address in ksm_scan.address */
        if (!vma->anon_vma)
            ksm_scan.address = vma->vm_end;

        while (ksm_scan.address < vma->vm_end) {
            if (ksm_test_exit(mm))
                break;
            /* follow_page() looks up the struct page of the normally mapped page at this
               virtual address; KSM only deals with anonymous pages */
            *page = follow_page(vma, ksm_scan.address, FOLL_GET);
            if (IS_ERR_OR_NULL(*page)) {
                ksm_scan.address += PAGE_SIZE;
                cond_resched();
                continue;
            }
            /* PageAnon() checks whether this is an anonymous page */
            if (PageAnon(*page) ||
                page_trans_compound_anon(*page)) {
                /* The next two lines flush the caches for this page */
                flush_anon_page(vma, *page, ksm_scan.address);
                flush_dcache_page(*page);
                /* get_next_rmap_item() looks on the ksm_scan.rmap_list list for an
                   rmap_item matching this virtual address, creating one if none exists */
                rmap_item = get_next_rmap_item(slot,
                    ksm_scan.rmap_list, ksm_scan.address);
                if (rmap_item) {
                    /* Point ksm_scan.rmap_list at the rmap_item just found or created, ready
                       for the next step of the scan. Having found a suitable anonymous page,
                       release mm->mmap_sem (taken for the VMA walk) and return the rmap_item;
                       the page itself is returned through *page */
                    ksm_scan.rmap_list =
                            &rmap_item->rmap_list;
                    ksm_scan.address += PAGE_SIZE;
                } else
                    put_page(*page);
                up_read(&mm->mmap_sem);
                return rmap_item;
            }
            put_page(*page);
            ksm_scan.address += PAGE_SIZE;
            cond_resched();
        }
    }
    /* Reaching here means the for loop scanned every VMA of this process without finding
       a suitable anonymous page (otherwise the rmap_item would have been returned above).
       If the scanned process has already exited (mm->mm_users == 0), reset
       ksm_scan.address to 0. */
    if (ksm_test_exit(mm)) {
        ksm_scan.address = 0;
        ksm_scan.rmap_list = &slot->rmap_list;
    }
    /*
     * Nuke all the rmap_items that are above this current rmap:
     * because there were no VM_MERGEABLE vmas with such addresses.
     */
    /* No suitable anonymous page was found in this process, so the remaining rmap_items
       are no longer useful; delete them all to avoid wasting memory */
    remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

    spin_lock(&ksm_mmlist_lock);
    /* Move on to the next mm_slot; the mm_slot list is being modified here, so the
       ksm_mmlist_lock spinlock protects it */
    ksm_scan.mm_slot = list_entry(slot->mm_list.next,
                        struct mm_slot, mm_list);

    /* Handle the case where this mm is finished with (the process has exited, or it has no
       VM_MERGEABLE areas left): remove the mm_slot from the ksm_mm_head list, free the
       mm_slot, and clear the MMF_VM_MERGEABLE bit in mm->flags */
    if (ksm_scan.address == 0) {
        /*
         * We've completed a full scan of all vmas, holding mmap_sem
         * throughout, and found no VM_MERGEABLE: so do the same as
         * __ksm_exit does to remove this mm from all our lists now.
         * This applies either when cleaning up after __ksm_exit
         * (but beware: we can reach here even before __ksm_exit),
         * or when all VM_MERGEABLE areas have been unmapped (and
         * mmap_sem then protects against race with MADV_MERGEABLE).
         */
        hash_del(&slot->link);
        list_del(&slot->mm_list);
        spin_unlock(&ksm_mmlist_lock);

        free_mm_slot(slot);
        clear_bit(MMF_VM_MERGEABLE, &mm->flags);
        up_read(&mm->mmap_sem);
        mmdrop(mm);
    } else {
        spin_unlock(&ksm_mmlist_lock);
        up_read(&mm->mmap_sem);
    }

    /* Repeat until we've completed scanning the whole list */
    /* If this round has not yet covered all mm_slots, continue with the next one */
    slot = ksm_scan.mm_slot;
    if (slot != &ksm_mm_head)
        goto next_mm;

    /* A full round over all mm_slots is complete, so bump ksm_scan.seqnr */
    ksm_scan.seqnr++;
    return NULL;
}

Back in ksm_do_scan().

Implementation of cmp_and_merge_page():

[ksm_do_scan()->cmp_and_merge_page()]

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
/*
 * Parameters:
 * @page: the suitable anonymous page just found while scanning the mm_slot.
 * @rmap_item: the rmap_item that describes this page.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
    struct rmap_item *tree_rmap_item;
    struct page *tree_page = NULL;
    struct stable_node *stable_node;
    struct page *kpage;
    unsigned int checksum;
    int err;

    /* If this page is already a KSM page, page_stable_node() returns its stable_node,
       otherwise it returns NULL */
    stable_node = page_stable_node(page);
    if (stable_node) {
        if (stable_node->head != &migrate_nodes &&
            get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
            rb_erase(&stable_node->node,
                 root_stable_tree + NUMA(stable_node->nid));
            stable_node->head = &migrate_nodes;
            list_add(&stable_node->list, stable_node->head);
        }
        if (stable_node->head != &migrate_nodes &&
            rmap_item->head == stable_node)
            return;
    }

    /* We first start with searching the page inside the stable tree */
    /* stable_tree_search() looks in the stable red-black tree for a stable page whose content matches page; see its implementation below */
    kpage = stable_tree_search(page);
    /* If the stable page kpage found is the very same page, the page is already a KSM page
       and there is nothing more to do; return directly. put_page() drops the _count
       reference taken in scan_get_next_rmap_item()->follow_page(). */
    if (kpage == page && rmap_item->head == stable_node) {
        put_page(kpage);
        return;
    }
    
    remove_rmap_item_from_tree(rmap_item);

    if (kpage) {
        /* A node with identical content was found in the stable tree, so call
           try_to_merge_with_ksm_page() to try to merge this page into it. On success,
           stable_tree_append() adds the rmap_item to the stable_node->hlist list;
           see the implementations below */
        err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
        if (!err) {
            /*
             * The page was successfully merged:
             * add its rmap_item to the stable tree.
             */
            lock_page(kpage);
            /* After try_to_merge_with_ksm_page() has merged page into kpage, some
               bookkeeping remains; see stable_tree_append() below */
            stable_tree_append(rmap_item, page_stable_node(kpage));
            unlock_page(kpage);
        }
        put_page(kpage);
        return;
    }

    /*
     * If the hash value of the page has changed from the last time
     * we calculated it, this page is changing frequently: therefore we
     * don't want to insert it in the unstable tree, and we don't want
     * to waste our time searching for something identical to it there.
     */
    /* No node with identical content was found in the stable tree, so recompute the page's
       checksum. If it has changed, the page is being modified frequently and is not a good
       candidate for the unstable red-black tree */
    checksum = calc_checksum(page);
    if (rmap_item->oldchecksum != checksum) {
        rmap_item->oldchecksum = checksum;
        return;
    }

    /* The code above handled the case where a match was found in the stable tree;
       if none was found, the unstable tree is searched next */

    /* unstable_tree_search_insert() searches the unstable red-black tree for a node whose
       content is identical to this page; if none is found, page is inserted into the
       unstable tree. The unstable tree is torn down at the end of each full scan and pages
       are re-inserted during the next one; see the implementation below */
    tree_rmap_item =
        unstable_tree_search_insert(rmap_item, page, &tree_page);


    /* If a node tree_rmap_item with an identical page tree_page is found in the unstable
       tree, try_to_merge_two_pages() tries to merge page and tree_page into a single KSM
       page kpage. stable_tree_insert() then adds kpage to the stable red-black tree as a
       new stable node, and stable_tree_append() adds tree_rmap_item and rmap_item to the
       stable node's hlist and updates the ksm_pages_sharing and ksm_pages_shared counters */
    if (tree_rmap_item) {
        /* A tree_page identical to the candidate page was found in the unstable tree; try to
           merge page and tree_page into one KSM page, see try_to_merge_two_pages() below */
        kpage = try_to_merge_two_pages(rmap_item, page,
                        tree_rmap_item, tree_page);
        put_page(tree_page);
        if (kpage) {
            /*
             * The pages were successfully merged: insert new
             * node in the stable tree and add both rmap_items.
             */
            lock_page(kpage);
            /* Now that the candidate page has been promoted to the KSM page kpage,
               stable_tree_insert() adds kpage to the stable tree; see its implementation below */
            stable_node = stable_tree_insert(kpage);
            if (stable_node) {
                stable_tree_append(tree_rmap_item, stable_node);
                stable_tree_append(rmap_item, stable_node);
            }
            unlock_page(kpage);

            /*
             * If we fail to insert the page into the stable tree,
             * we will have 2 virtual addresses that are pointing
             * to a ksm page left outside the stable tree,
             * in which case we need to break_cow on both.
             */

            /* At this point we have covered how a page is merged into a KSM page, including
               the stable and unstable tree lookups; next comes the handling of failures
               during merging. */

            /* If inserting the stable node into the stable tree failed, call break_cow()
               to deliberately trigger a page fault that splits the KSM page again; see its
               implementation below */
            if (!stable_node) {
                break_cow(tree_rmap_item);
                break_cow(rmap_item);
            }
        }
    }
}

Back in ksm_do_scan().

Implementation of stable_tree_search(): search the stable red-black tree for a node whose page content is identical to page.

[ksm_do_scan()->cmp_and_merge_page()->stable_tree_search()]

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
    int nid;
    struct rb_root *root;
    struct rb_node **new;
    struct rb_node *parent;
    struct stable_node *stable_node;
    struct stable_node *page_node;

    /* If page is already a stable (KSM) page, there is no need to search */
    page_node = page_stable_node(page);
    if (page_node && page_node->head != &migrate_nodes) {
        /* ksm page forked */
        get_page(page);
        return page;
    }

    nid = get_kpfn_nid(page_to_pfn(page));

    /* Start searching the stable red-black tree here */
    root = root_stable_tree + nid;
again:
    new = &root->rb_node;
    parent = NULL;

    while (*new) {
        struct page *tree_page;
        int ret;

        cond_resched();
        /* rb_entry() extracts the stable_node from the rb_node */
        stable_node = rb_entry(*new, struct stable_node, node);
        /* get_ksm_page() turns the stable node into a struct page: the stable node's kpfn
           member holds the page frame number, from which the corresponding tree_page can be
           derived. Note that this function takes a _count reference on tree_page. */
        tree_page = get_ksm_page(stable_node, false);
        if (!tree_page)
            return NULL;
        /* memcmp_pages() compares the contents of page and tree_page */
        ret = memcmp_pages(page, tree_page);
        /* put_page() drops the _count reference that get_ksm_page() took above */
        put_page(tree_page);

        parent = *new;
        /* If the contents differ, keep walking down the red-black tree */
        if (ret < 0)
            new = &parent->rb_left;
        else if (ret > 0)
            new = &parent->rb_right;
        else {
            /*
             * Lock and unlock the stable_node's page (which
             * might already have been migrated) so that page
             * migration is sure to notice its raised count.
             * It would be more elegant to return stable_node
             * than kpage, but that involves more changes.
             */
            /* page and tree_page have identical content; call get_ksm_page() again (this
               time locking the page) to take another reference, so that page migration
               notices the page is in use, and finally return tree_page */
            tree_page = get_ksm_page(stable_node, true);
            if (tree_page) {
                unlock_page(tree_page);
                if (get_kpfn_nid(stable_node->kpfn) !=
                        NUMA(stable_node->nid)) {
                    put_page(tree_page);
                    goto replace;
                }
                return tree_page;
            }
            /*
             * There is now a place for page_node, but the tree may
             * have been rebalanced, so re-evaluate parent and new.
             */
            if (page_node)
                goto again;
            return NULL;
        }
    }

    if (!page_node)
        return NULL;

    list_del(&page_node->list);
    DO_NUMA(page_node->nid = nid);
    rb_link_node(&page_node->node, parent, new);
    rb_insert_color(&page_node->node, root);
    get_page(page);
    return page;

replace:
    if (page_node) {
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        rb_replace_node(&stable_node->node, &page_node->node, root);
        get_page(page);
    } else {
        rb_erase(&stable_node->node, root);
        page = NULL;
    }
    stable_node->head = &migrate_nodes;
    list_add(&stable_node->list, stable_node->head);
    return page;
}

Back in cmp_and_merge_page().

Implementation of try_to_merge_with_ksm_page(): merge page into a KSM page.

[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()]

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
/*
 * Parameters:
 * @rmap_item: the rmap_item of the candidate page
 * @page: the page to be merged
 * @kpage: the KSM page in the stable tree; the candidate page is merged into kpage.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
                      struct page *page, struct page *kpage)
{
    struct mm_struct *mm = rmap_item->mm;
    struct vm_area_struct *vma;
    int err = -EFAULT;

    /* The VMA is about to be used, so take the mm->mmap_sem read lock */
    down_read(&mm->mmap_sem);
    if (ksm_test_exit(mm))
        goto out;
    /* Find the VMA that contains this virtual address */
    vma = find_vma(mm, rmap_item->address);
    if (!vma || vma->vm_start > rmap_item->address)
        goto out;

    /* try_to_merge_one_page() tries to merge page into kpage; see its implementation below */
    err = try_to_merge_one_page(vma, page, kpage);
    if (err)
        goto out;

    /* Unstable nid is in union with stable anon_vma: remove first */
    remove_rmap_item_from_tree(rmap_item);

    /* Must get reference to anon_vma while still holding mmap_sem */
    /* Point rmap_item->anon_vma at the VMA's anon_vma */
    rmap_item->anon_vma = vma->anon_vma;
    /* Take a reference on anon_vma->refcount so the anon_vma cannot be freed */
    get_anon_vma(vma->anon_vma);
out:
    /* Release the mm->mmap_sem read lock */
    up_read(&mm->mmap_sem);
    return err;
}

Back in cmp_and_merge_page().

Implementation of try_to_merge_one_page(): try to merge page into kpage.

[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()]

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
                 struct page *page, struct page *kpage)
{
    pte_t orig_pte = __pte(0);
    int err = -EFAULT;

    /* page and kpage are the same page */
    if (page == kpage)          /* ksm page forked */
        return 0;
    /* The VMA of this page is not mergeable, i.e. VM_MERGEABLE is not set */
    if (!(vma->vm_flags & VM_MERGEABLE))
        goto out;
    if (PageTransCompound(page) && page_trans_compound_anon_split(page))
        goto out;
    BUG_ON(PageTransCompound(page));
    /* Filter out pages that are not anonymous */
    if (!PageAnon(page))
        goto out;

    /*
     * We need the page lock to read a stable PageSwapCache in
     * write_protect_page().  We use trylock_page() instead of
     * lock_page() because we don't want to wait here - we
     * prefer to continue scanning and merging different pages,
     * then come back to this page when it is unlocked.
     */
    /* Why trylock_page(page) instead of lock_page(page)? We need the page lock so that
       write_protect_page() can later read a stable PageSwapCache state, but we do not want
       to sleep waiting for it here: if someone else holds the lock, skip this page and go
       on merging other pages first. */
    if (!trylock_page(page))
        goto out;
    /*
     * If this anonymous page is mapped only here, its pte may need
     * to be write-protected.  If it's mapped elsewhere, all of its
     * ptes are necessarily already write-protected.  But in either
     * case, we need to lock and check page_count is not raised.
     */
    /* write_protect_page() write-protects the pte mapping this page in the VMA; see its implementation below */
    if (write_protect_page(vma, page, &orig_pte) == 0) {
        /* When merging with an unstable tree node, kpage may be NULL: in that case just mark
           page as the (future) stable node and record its activity with mark_page_accessed() */
        if (!kpage) {
            /*
             * While we hold page lock, upgrade page from
             * PageAnon+anon_vma to PageKsm+NULL stable_node:
             * stable_tree_insert() will update stable_node.
             */
            set_page_stable_node(page, NULL);
            mark_page_accessed(page);
            err = 0;

        /* pages_identical() compares the contents of page and kpage once more; if they still
           match, replace_page() rewires page's pte to point at kpage */
        } else if (pages_identical(page, kpage))
            err = replace_page(vma, page, kpage, orig_pte); /* map the VMA's user address to kpage; see below */
    }

    if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
        munlock_vma_page(page);
        if (!PageMlocked(kpage)) {
            unlock_page(page);
            lock_page(kpage);
            mlock_vma_page(kpage);
            page = kpage;       /* for final unlock */
        }
    }

    unlock_page(page);
out:
    return err;
}

Back in try_to_merge_with_ksm_page().

Implementation of write_protect_page():

[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()->write_protect_page()]

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                  pte_t *orig_pte)
{
    struct mm_struct *mm = vma->vm_mm;
    unsigned long addr;
    pte_t *ptep;
    spinlock_t *ptl;
    int swapped;
    int err = -EFAULT;
    unsigned long mmun_start;   /* For mmu_notifiers */
    unsigned long mmun_end;     /* For mmu_notifiers */

    /* Compute the virtual address of page from the VMA and the struct page: page->index
       records the offset within the mapping, from which the virtual address follows */
    addr = page_address_in_vma(page, vma);
    if (addr == -EFAULT)
        goto out;

    BUG_ON(PageTransCompound(page));

    mmun_start = addr;
    mmun_end   = addr + PAGE_SIZE;
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
    /* Walk the page tables of mm to find the pte entry for this virtual address */
    ptep = page_check_address(page, mm, addr, &ptl, 0);
    if (!ptep)
        goto out_mn;

    /* This function's job is to make the pte write-protected, so a pte that is writable or
       dirty must be turned read-only (on ARM by setting L_PTE_RDONLY in the pte, on x86 by
       clearing _PAGE_BIT_RW); a dirty page is passed to set_page_dirty(), which calls the
       page's mapping->a_ops->set_page_dirty() and notifies the writeback system. */
    if (pte_write(*ptep) || pte_dirty(*ptep)) {
        pte_t entry;

        swapped = PageSwapCache(page);
        /* Flush the caches for this page */
        flush_cache_page(vma, addr, page_to_pfn(page));
        /*
         * Ok this is tricky, when get_user_pages_fast() run it doesn't
         * take any lock, therefore the check that we are going to make
         * with the pagecount against the mapcount is racey and
         * O_DIRECT can happen right after the check.
         * So we clear the pte and flush the tlb before the check
         * this assure us that no O_DIRECT can happen after the check
         * or in the middle of the check.
         */
        /* Clear the pte and flush the corresponding TLB entry so that no O_DIRECT I/O can start afterwards; the function returns the pte's previous contents */
        entry = ptep_clear_flush_notify(vma, addr, ptep);
        /*
         * Check that no O_DIRECT or similar I/O is in progress on the
         * page
         */
        /* Why this particular check?
           Answering it requires understanding how a page's _count and _mapcount reference
           counts are used. write_protect_page() wants to make the page read-only so that it
           can later be compared and merged. Making a page read-only requires two things:
               (1) confirming that nobody else holds a reference to the page;
               (2) turning the pte that points to the page into a read-only pte.
           The second condition is easy; the difficulty is the first. In general a page's
           _count has four sources:
               (1) the page cache holds it on its radix tree -- KSM does not handle
                   page-cache pages;
               (2) it is referenced by user space, which raises both _count and _mapcount;
               (3) page->private also raises _count; for an anonymous page this means it is
                   in the swap cache, see add_to_swap();
               (4) some kernel operations temporarily raise _count, e.g. follow_page() and
                   get_user_pages_fast().
           Assuming no other kernel path is touching the page and it is not in the swap
           cache, the two counters satisfy:
                   (page->_mapcount + 1) == page->_count
           In the write_protect_page() scenario, swapped records whether the page is in the
           swap cache (add_to_swap() raises _count), so the relation becomes:
                   (page->_mapcount + 1) + PageSwapCache() == page->_count
           Where does the extra "+1" below come from? scan_get_next_rmap_item() obtained
           this page through follow_page(), which raised _count by one. So, with no
           O_DIRECT or similar I/O in flight, the expected relation is:
                   (page->_mapcount + 1) + 1 + PageSwapCache() == page->_count
           If the equality does not hold, some other kernel path (for example O_DIRECT I/O)
           is operating on the page, and write_protect_page() must bail out with an error.
        */
        if (page_mapcount(page) + 1 + swapped != page_count(page)) {
            set_pte_at(mm, addr, ptep, entry);
            goto out_unlock;
        }
        if (pte_dirty(entry))
            set_page_dirty(page);

        /* The next two lines build a read-only pte entry and write it into the page table */
        entry = pte_mkclean(pte_wrprotect(entry));
        set_pte_at_notify(mm, addr, ptep, entry);
    }
    *orig_pte = *ptep;
    err = 0;

out_unlock:
    pte_unmap_unlock(ptep, ptl);
out_mn:
    mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
    return err;
}

Back in try_to_merge_one_page().

Implementation of replace_page():

[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()->replace_page()]

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage (the old page)
 * @kpage:    the ksm page we replace page by (the KSM page found in the stable tree)
 * @orig_pte: the original value of the pte (used to detect whether page was modified in the meantime)
 *
 * Returns 0 on success, -EFAULT on failure.
 */
/* In short: build a new pte from kpage's pfn and the mapping's protection attributes, and
   write it into the pte slot that used to map page, so that the user address in the VMA
   now maps kpage instead of the old page */
static int replace_page(struct vm_area_struct *vma, struct page *page,
            struct page *kpage, pte_t orig_pte)
{
    struct mm_struct *mm = vma->vm_mm;
    pmd_t *pmd;
    pte_t *ptep;
    spinlock_t *ptl;
    unsigned long addr;
    int err = -EFAULT;
    unsigned long mmun_start;   /* For mmu_notifiers */
    unsigned long mmun_end;     /* For mmu_notifiers */

    addr = page_address_in_vma(page, vma);
    if (addr == -EFAULT)
        goto out;

    pmd = mm_find_pmd(mm, addr);
    if (!pmd)
        goto out;

    mmun_start = addr;
    mmun_end   = addr + PAGE_SIZE;
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

    ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
    if (!pte_same(*ptep, orig_pte)) {
        pte_unmap_unlock(ptep, ptl);
        goto out_mn;
    }

    /* Take a _count reference on kpage */
    get_page(kpage);

    /* page_add_anon_rmap() looks as if it adds kpage to the reverse-mapping (rmap) system,
       but kpage is already in it, so in effect this only bumps its _mapcount */
    page_add_anon_rmap(kpage, vma, addr);

    /* Flush the cache for addr and its pte, clear the pte and the corresponding TLB entry, then install the new pte */
    flush_cache_page(vma, addr, pte_pfn(*ptep));
    ptep_clear_flush_notify(vma, addr, ptep);
    set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

    /* Drop page's _mapcount and _count, and release its swap slot if it is no longer mapped */
    page_remove_rmap(page);
    if (!page_mapped(page))
        try_to_free_swap(page);
    put_page(page);

    pte_unmap_unlock(ptep, ptl);
    err = 0;
out_mn:
    mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
    return err;
}

Back in try_to_merge_one_page().

Implementation of stable_tree_append():

[ksm_do_scan()->cmp_and_merge_page()->stable_tree_append()]

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
/*
 * Parameters:
 * @rmap_item: the rmap_item of the page being appended.
 * @stable_node: struct stable_node is the structure a KSM page's mapping points to, playing
 *  a role similar to anon_vma for an ordinary anonymous page; here it is kpage's stable_node.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
                   struct stable_node *stable_node)
{
    rmap_item->head = stable_node;
    rmap_item->address |= STABLE_FLAG;
    /* Add the rmap_item to the hlist hanging off kpage's stable_node */
    hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

    /* If other pages are already attached to this stable_node, bump ksm_pages_sharing;
       otherwise this is a newly created stable node, so bump ksm_pages_shared.
       ksm_pages_shared counts how many KSM (stable) nodes exist in the system, while
       ksm_pages_sharing counts the additional pages merged into them. */
    if (rmap_item->hlist.next)
        ksm_pages_sharing++;
    else
        ksm_pages_shared++;
}
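
As a concrete example (not from the original text): if three identical anonymous pages from different processes end up merged into a single KSM page, the first rmap_item attached to the new stable node bumps ksm_pages_shared to 1, and the other two bump ksm_pages_sharing to 2. These totals are exported as pages_shared and pages_sharing under /sys/kernel/mm/ksm.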

Back in cmp_and_merge_page().

Implementation of unstable_tree_search_insert():

[ksm_do_scan()->cmp_and_merge_page()->unstable_tree_search_insert()]

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                          struct page *page,
                          struct page **tree_pagep)
{
    struct rb_node **new;
    struct rb_root *root;
    struct rb_node *parent = NULL;
    int nid;

    nid = get_kpfn_nid(page_to_pfn(page));
    root = root_unstable_tree + nid;
    new = &root->rb_node;

    /* Search the unstable red-black tree, whose root is in root_unstable_tree */
    while (*new) {
        struct rmap_item *tree_rmap_item;
        struct page *tree_page;
        int ret;

        cond_resched();
        tree_rmap_item = rb_entry(*new, struct rmap_item, node);
        /* get_mergeable_page() checks whether the page taken from the tree is still eligible;
           only anonymous pages can be merged. If no page with identical content is found in
           the tree, the candidate page itself is inserted into the tree below. */
        tree_page = get_mergeable_page(tree_rmap_item);
        if (IS_ERR_OR_NULL(tree_page))
            return NULL;

        /*
         * Don't substitute a ksm page for a forked page.
         */
        if (page == tree_page) {
            put_page(tree_page);
            return NULL;
        }

        ret = memcmp_pages(page, tree_page);

        parent = *new;
        if (ret < 0) {
            put_page(tree_page);
            new = &parent->rb_left;
        } else if (ret > 0) {
            put_page(tree_page);
            new = &parent->rb_right;
        } else if (!ksm_merge_across_nodes &&
               page_to_nid(tree_page) != nid) {
            /*
             * If tree_page has been migrated to another NUMA node,
             * it will be flushed out and put in the right unstable
             * tree next time: only merge with it when across_nodes.
             */
            put_page(tree_page);
            return NULL;
        } else {
            *tree_pagep = tree_page;
            return tree_rmap_item;
        }
    }

    /* The low 12 bits of rmap_item->address hold flag bits: UNSTABLE_FLAG (0x100) marks the
       rmap_item as sitting in the unstable tree, and the low 8 bits store the full-scan
       sequence number seqnr. Unstable tree nodes are dropped after each full scan and
       re-inserted into the unstable tree on the next one. ksm_pages_unshared counts the
       nodes currently in the unstable tree. */
    rmap_item->address |= UNSTABLE_FLAG;
    rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
    DO_NUMA(rmap_item->nid = nid);
    rb_link_node(&rmap_item->node, parent, new);
    rb_insert_color(&rmap_item->node, root);

    ksm_pages_unshared++;
    return NULL;
}

Back in cmp_and_merge_page().

Implementation of try_to_merge_two_pages():

[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_two_pages()]

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
                       struct page *page,
                       struct rmap_item *tree_rmap_item,
                       struct page *tree_page)
{
    int err;
    /* try_to_merge_with_ksm_page() is called twice here, with different arguments and for
       different purposes. The first call passes the candidate page and its rmap_item with
       kpage == NULL, so it mainly write-protects page's pte and marks the page as a KSM
       (stable) node. */
    err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
    if (!err) {
        /* The second call passes tree_page and its tree_rmap_item, with the candidate page
           as kpage: it write-protects tree_page's pte and then compares the contents of
           tree_page and page once more. The contents were already compared during the
           unstable-tree lookup, so why compare again? Because in the meantime another
           process may have modified either page. Once the two pages are confirmed
           identical, a new pte built from page's pfn is installed into tree_page's page
           table, so tree_page's virtual address now maps the physical page page: tree_page
           and page have been merged into one KSM page, with page serving as that KSM page. */
        err = try_to_merge_with_ksm_page(tree_rmap_item,
                            tree_page, page);
        /*
         * If that fails, we have a ksm page with only one pte
         * pointing to it: so break it.
         */
        if (err)
            break_cow(rmap_item);
    }
    return err ? NULL : page;
}

Back in cmp_and_merge_page().

Implementation of stable_tree_insert():

[ksm_do_scan()->cmp_and_merge_page()->stable_tree_insert()]

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
    int nid;
    unsigned long kpfn;
    struct rb_root *root;
    struct rb_node **new;
    struct rb_node *parent = NULL;
    struct stable_node *stable_node;

    kpfn = page_to_pfn(kpage);
    nid = get_kpfn_nid(kpfn);
    root = root_stable_tree + nid;
    new = &root->rb_node;

    /* Walk the stable tree to find the right leaf position to insert at */
    while (*new) {
        struct page *tree_page;
        int ret;

        cond_resched();
        stable_node = rb_entry(*new, struct stable_node, node);
        tree_page = get_ksm_page(stable_node, false);
        if (!tree_page)
            return NULL;

        ret = memcmp_pages(kpage, tree_page);
        put_page(tree_page);

        parent = *new;
        if (ret < 0)
            new = &parent->rb_left;
        else if (ret > 0)
            new = &parent->rb_right;
        else {
            /*
             * It is not a bug that stable_tree_search() didn't
             * find this node: because at that time our page was
             * not yet write-protected, so may have changed since.
             */
            return NULL;
        }
    }

    /* Allocate a new stable_node, point kpage->mapping at it, and insert the stable_node
       into the stable tree */
    stable_node = alloc_stable_node();
    if (!stable_node)
        return NULL;

    /* Afterwards (back in cmp_and_merge_page()) rmap_item and tree_rmap_item are added to the new stable node's hlist and the KSM statistics are updated */
    INIT_HLIST_HEAD(&stable_node->hlist);
    stable_node->kpfn = kpfn;
    set_page_stable_node(kpage, stable_node);
    DO_NUMA(stable_node->nid = nid);
    rb_link_node(&stable_node->node, parent, new);
    rb_insert_color(&stable_node->node, root);

    return stable_node;
}

Back in cmp_and_merge_page().

Implementation of break_cow():

This function handles a page that has already been write-protected: it deliberately provokes a write fault, i.e. a copy-on-write (COW) scenario. The rmap_item parameter holds the page's virtual address and the owning process's mm_struct, from which the corresponding VMA can be found.

[ksm_do_scan()->cmp_and_merge_page()->break_cow()]

static void break_cow(struct rmap_item *rmap_item)
{
    struct mm_struct *mm = rmap_item->mm;
    unsigned long addr = rmap_item->address;
    struct vm_area_struct *vma;

    /*
     * It is not an accident that whenever we want to break COW
     * to undo, we also need to drop a reference to the anon_vma.
     */
    put_anon_vma(rmap_item->anon_vma);

    down_read(&mm->mmap_sem);
    vma = find_mergeable_vma(mm, addr);
    if (vma)
        break_ksm(vma, addr);
    up_read(&mm->mmap_sem);
}

Implementation of break_ksm():

[ksm_do_scan()->cmp_and_merge_page()->break_cow()->break_ksm()]

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *  if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *      put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
    struct page *page;
    int ret = 0;

    do {
        cond_resched();
        /* follow_page() looks up the page with a normal mapping from the VMA and the virtual
           address. The flags are FOLL_GET | FOLL_MIGRATION: FOLL_GET takes a _count reference
           on the page, and FOLL_MIGRATION means wait for migration to complete if the page is
           currently being migrated. */
        page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
        if (IS_ERR_OR_NULL(page))
            break;

        /* For a KSM page, call handle_mm_fault() directly to fake a write fault
           (FAULT_FLAG_WRITE); the page-fault handler performs copy-on-write, and
           do_wp_page() eventually allocates a new page and maps it at this virtual
           address. */
        if (PageKsm(page))
            ret = handle_mm_fault(vma->vm_mm, vma, addr,
                            FAULT_FLAG_WRITE);
        else
            ret = VM_FAULT_WRITE;
        put_page(page);
    } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
    /*
     * We must loop because handle_mm_fault() may back out if there's
     * any difficulty e.g. if pte accessed bit gets updated concurrently.
     *
     * VM_FAULT_WRITE is what we have been hoping for: it indicates that
     * COW has been broken, even if the vma does not permit VM_WRITE;
     * but note that a concurrent fault might break PageKsm for us.
     *
     * VM_FAULT_SIGBUS could occur if we race with truncation of the
     * backing file, which also invalidates anonymous pages: that's
     * okay, that truncation will have unmapped the PageKsm for us.
     *
     * VM_FAULT_OOM: at the time of writing (late July 2009), setting
     * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
     * current task has TIF_MEMDIE set, and will be OOM killed on return
     * to user; and ksmd, having no mm, would never be chosen for that.
     *
     * But if the mm is in a limited mem_cgroup, then the fault may fail
     * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
     * even ksmd can fail in this way - though it's usually breaking ksm
     * just to undo a merge it made a moment before, so unlikely to oom.
     *
     * That's a pity: we might therefore have more kernel pages allocated
     * than we're counting as nodes in the stable tree; but ksm_do_scan
     * will retry to break_cow on each pass, so should recover the page
     * in due course.  The important thing is to not let VM_MERGEABLE
     * be cleared while any such pages might remain in the area.
     */
    return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

Source: https://blog.csdn.net/dai_xiangjun/article/details/118977235