伙伴系统中用于分配页的函数如下:
alloc_pages(mask,order)分配2^order页并返回一个struct page的实例,表示分配的内存块的起始页。alloc_page(mask)是前者在order=0情况下的简化形式,只分配一页。
get_zeroed_page(mask)分配一页并返回一个page实例,页对应的内存填充0(所有其他函数分配之后的内容是未定义的)。
__get_free_pages(mask,order)和__get_free_page(mask)的工作方式与上述函数相同,但返回分配内存块的虚拟地址,而不是page实例。
__get_dma_pages(gfp_mask,order)用来获得适用于DMA的页。
在空闲内存无法满足请求以至于分配失败的情况下,所有上述函数都返回空指针(alloc_pages和alloc_page)或者0(get_zeroed_page、__get_free_pages和__get_free_page)。因此内核在各次分配之后必须检查返回的结果。这种惯例与设计得很好的用户层应用程序没有什么不同,但在内核中忽略检查将会导致严重得多的故障。
前述所有函数中使用的mask参数的语义是什么?Linux将内存划分为内存域,内核提供了所谓的内存域修饰符,来指定从哪个内存域分配所需的页。
- #define __GFP_DMA ((__force gfp_t)0x01u) /* zone modifier: gfp_zone() maps it to ZONE_DMA when CONFIG_ZONE_DMA is set */
- #define __GFP_HIGHMEM ((__force gfp_t)0x02u) /* zone modifier: gfp_zone() maps it to ZONE_HIGHMEM under CONFIG_HIGHMEM, or to ZONE_MOVABLE when combined with __GFP_MOVABLE */
- #define __GFP_DMA32 ((__force gfp_t)0x04u) /* zone modifier: gfp_zone() maps it to ZONE_DMA32 when CONFIG_ZONE_DMA32 is set */
除了内存域修饰符之外,掩码中还可以设置一些标志,这些额外的标志并不限制从哪个物理内存段分配内存,但确实可以改变分配器的行为。
- #define __GFP_WAIT ((__force gfp_t)0x10u) //the request may be interrupted: the scheduler is free to run another process while it is pending, or a more important event may interrupt it
- #define __GFP_HIGH ((__force gfp_t)0x20u) //set for very important requests, i.e. when the kernel urgently needs memory; typically used when a failed allocation would have severe consequences for the kernel
- #define __GFP_IO ((__force gfp_t)0x40u) //the kernel may perform I/O while looking for free memory; pages picked for swap-out during the allocation are written to disk only if this flag is set
- #define __GFP_FS ((__force gfp_t)0x80u) //the kernel may perform VFS operations
- #define __GFP_COLD ((__force gfp_t)0x100u) //request a "cold" page, i.e. one that is not in the CPU cache
- #define __GFP_NOWARN ((__force gfp_t)0x200u) //suppress the kernel's allocation-failure warning
- #define __GFP_REPEAT ((__force gfp_t)0x400u) //retry automatically after a failed allocation, but give up after a number of attempts
- #define __GFP_NOFAIL ((__force gfp_t)0x800u) //keep retrying after a failure until the allocation succeeds
- #define __GFP_NORETRY ((__force gfp_t)0x1000u)//do not retry; the allocation may fail
- #define __GFP_COMP ((__force gfp_t)0x4000u)//add compound page metadata
- #define __GFP_ZERO ((__force gfp_t)0x8000u)//on success, return a page filled with zero bytes
- #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) //do not use the emergency reserves
- #define __GFP_HARDWALL ((__force gfp_t)0x20000u) // only meaningful on NUMA systems: restricts the allocation to the nodes associated with the CPUs assigned to the current process. Pointless when the process may run on all CPUs (the default); it only matters when the set of CPUs is restricted.
- #define __GFP_THISNODE ((__force gfp_t)0x40000u)//only meaningful on NUMA systems: if set, no other node may be used as fallback when the allocation fails; the memory must come from the current or the explicitly specified node
- #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) //mark the allocated memory as reclaimable
- #define __GFP_MOVABLE ((__force gfp_t)0x100000u) //mark the allocated memory as movable
-
- #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
- #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-
- /* This equals 0, but use constants in case they ever change */
- #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
由于这些标志总是组合使用,内核做了一些分组,包含了用于各种标准情形的适当标志。
- #define GFP_ATOMIC (__GFP_HIGH)//for atomic allocations that must never be interrupted; may use memory from the emergency reserves
- #define GFP_NOIO (__GFP_WAIT)//I/O is explicitly forbidden, but the allocation may be interrupted
- #define GFP_NOFS (__GFP_WAIT | __GFP_IO)//VFS-layer operations are explicitly forbidden, but the allocation may be interrupted
- #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)//default setting for kernel allocations
- #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_RECLAIMABLE)
- #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)//default setting for user allocations
- #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
- __GFP_HIGHMEM)//extension of GFP_USER for user-space pages: additionally permits highmem that cannot be mapped directly; using highmem pages has no drawback because the address space of user processes is always organized through nonlinear page tables
- #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_HIGHMEM | \
- __GFP_MOVABLE)//like GFP_HIGHUSER, but the allocation is served from the virtual zone ZONE_MOVABLE
- #define GFP_NOFS_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_MOVABLE)
- #define GFP_USER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_MOVABLE)
- #define GFP_HIGHUSER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_HIGHMEM | \
- __GFP_MOVABLE)
-
- #ifdef CONFIG_NUMA
- #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
- #else
- #define GFP_THISNODE ((__force gfp_t)0)
- #endif
-
- /* This mask makes up all the page movable related flags */
- #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
-
- /* Control page allocator reclaim behavior */
- #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
- __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
- __GFP_NORETRY|__GFP_NOMEMALLOC)
-
- /* Control allocation constraints */
- #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
-
- /* Do not use these with a slab allocator */
- #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-
- /* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
- platforms, used as appropriate on others */
-
- #define GFP_DMA __GFP_DMA
-
- /* 4GB DMA on some platforms */
- #define GFP_DMA32 __GFP_DMA32
- #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) /* single-page form: alloc_pages() with order 0 */
- #define __get_free_page(gfp_mask) \
- __get_free_pages((gfp_mask),0) /* single-page form: __get_free_pages() with order 0 */
- #define __get_dma_pages(gfp_mask, order) \
- __get_free_pages((gfp_mask) | GFP_DMA,(order)) /* __get_free_pages() with GFP_DMA OR-ed into the mask */
- fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
- { /* Allocate through alloc_pages() and hand back an address (as unsigned long) instead of a struct page; returns 0 on failure. */
- struct page * page;
- page = alloc_pages(gfp_mask, order);
- if (!page)
- return 0; /* allocation failed: callers see 0 rather than a NULL page pointer */
- return (unsigned long) page_address(page); /* address of the first page, as reported by page_address() */
- }
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order) /* allocate on the node reported by numa_node_id() */
根据上面的代码,可以得出各个分配函数之间的关系如下图所示:

主要的函数是alloc_pages_node。alloc_pages_node源代码的详细分析如下:
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- { /* Allocate on node nid (negative nid: the node from numa_node_id()); returns NULL at once when order >= MAX_ORDER. */
- if (unlikely(order >= MAX_ORDER))//reject oversized requests up front
- return NULL;
-
- /* Unknown node is current node */
- if (nid < 0)//a negative (non-existent) node ID selects the node reported by numa_node_id()
- nid = numa_node_id();
-
- return __alloc_pages(gfp_mask, order,
- NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));//the real work is delegated to __alloc_pages(); gfp_zone() selects which of the node's zonelists is passed
- }
- static inline enum zone_type gfp_zone(gfp_t flags)//maps the zone modifiers in flags to the zone type to allocate from
- {
- int base = 0;
-
- #ifdef CONFIG_NUMA
- if (flags & __GFP_THISNODE)
- base = MAX_NR_ZONES; /* __GFP_THISNODE offsets every result below by MAX_NR_ZONES */
- #endif
-
- #ifdef CONFIG_ZONE_DMA
- if (flags & __GFP_DMA)
- return base + ZONE_DMA;
- #endif
- #ifdef CONFIG_ZONE_DMA32
- if (flags & __GFP_DMA32)
- return base + ZONE_DMA32;
- #endif
- if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
- (__GFP_HIGHMEM | __GFP_MOVABLE)) /* both bits must be set */
- return base + ZONE_MOVABLE;
- #ifdef CONFIG_HIGHMEM
- if (flags & __GFP_HIGHMEM)
- return base + ZONE_HIGHMEM;
- #endif
- return base + ZONE_NORMAL; /* no zone modifier matched */
- }
__alloc_pages源代码详细分析如下:
- /*
-  * __alloc_pages - central allocation routine behind alloc_pages_node().
-  * @gfp_mask: GFP flags controlling how free pages are looked for
-  * @order:    the request is for 2^order pages
-  * @zonelist: NULL-terminated list of zones that are tried in order
-  *
-  * Returns the first page of the allocated block, or NULL on failure.
-  *
-  * Escalation order, as implemented below:
-  *   1. try every zone against the pages_low watermark;
-  *   2. wake kswapd and retry against pages_min, possibly relaxed by
-  *      ALLOC_HARDER / ALLOC_HIGH;
-  *   3. tasks with PF_MEMALLOC or TIF_MEMDIE retry ignoring watermarks;
-  *   4. callers that may wait run synchronous reclaim and retry;
-  *   5. if reclaim made no progress, the OOM killer may be invoked;
-  *   6. depending on order and the retry flags, loop back to step 3.
-  */
- struct page * fastcall
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
-         struct zonelist *zonelist)
- {
-     const gfp_t wait = gfp_mask & __GFP_WAIT; /* non-zero if the request carries __GFP_WAIT */
-     struct zone **z; /* cursor into zonelist->zones */
-     struct page *page;
-     struct reclaim_state reclaim_state;
-     struct task_struct *p = current;
-     int do_retry;
-     int alloc_flags;
-     int did_some_progress;
-
-     might_sleep_if(wait); /* callers that set __GFP_WAIT may sleep in here */
-
-     if (should_fail_alloc_page(gfp_mask, order)) /* bail out before any real work is done */
-         return NULL;
-
- restart:
-     z = zonelist->zones; /* start at the first zone of the list */
-
-     if (unlikely(*z == NULL)) {
-         /*
-          * Happens if we have an empty zonelist as a result of
-          * GFP_THISNODE being used on a memoryless node
-          */
-         return NULL;
-     }
-
-     /* First attempt: get_page_from_freelist() checks against pages_low. */
-     page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
-     if (page)
-         goto got_pg;
-
-     /*
-      * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
-      * __GFP_NOWARN set) should not cause reclaim since the subsystem
-      * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
-      * using a larger set of nodes after it has established that the
-      * allowed per node queues are empty and that nodes are
-      * over allocated.
-      */
-     if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
-         goto nopage;
-
-     /* Kick background reclaim on every zone of the list. */
-     for (z = zonelist->zones; *z; z++)
-         wakeup_kswapd(*z, order);
-
-     /*
-      * OK, we're below the kswapd watermark and have kicked background
-      * reclaim. Now things get more complex, so set up alloc_flags according
-      * to how we want to proceed.
-      *
-      * The caller may dip into page reserves a bit more if the caller
-      * cannot run direct reclaim, or if the caller has realtime scheduling
-      * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
-      * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
-      */
-     alloc_flags = ALLOC_WMARK_MIN; /* second attempt checks against pages_min */
-     if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
-         alloc_flags |= ALLOC_HARDER; /* realtime task outside interrupt context, or caller cannot wait */
-     if (gfp_mask & __GFP_HIGH)
-         alloc_flags |= ALLOC_HIGH;
-     if (wait)
-         alloc_flags |= ALLOC_CPUSET;
-
-     /*
-      * Go through the zonelist again. Let __GFP_HIGH and allocations
-      * coming from realtime tasks go deeper into reserves.
-      *
-      * This is the last chance, in general, before the goto nopage.
-      * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-      * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-      */
-     page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
-     if (page)
-         goto got_pg;
-
-     /* This allocation should allow future memory freeing. */
-
- rebalance:
-     if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-             && !in_interrupt()) {
-         if (!(gfp_mask & __GFP_NOMEMALLOC)) {
- nofail_alloc:
-             /* go through the zonelist yet again, ignoring mins */
-             page = get_page_from_freelist(gfp_mask, order,
-                 zonelist, ALLOC_NO_WATERMARKS);
-             if (page)
-                 goto got_pg;
-             if (gfp_mask & __GFP_NOFAIL) {
-                 /* __GFP_NOFAIL: wait a little and try again until it works */
-                 congestion_wait(WRITE, HZ/50);
-                 goto nofail_alloc;
-             }
-         }
-         goto nopage;
-     }
-
-     /* Atomic allocations - we can't balance anything */
-     if (!wait)
-         goto nopage;
-
-     cond_resched();
-
-     /* We now go into synchronous reclaim */
-     cpuset_memory_pressure_bump();
-     p->flags |= PF_MEMALLOC; /* set only for the duration of the reclaim call */
-     reclaim_state.reclaimed_slab = 0;
-     p->reclaim_state = &reclaim_state;
-
-     did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
-
-     p->reclaim_state = NULL;
-     p->flags &= ~PF_MEMALLOC;
-
-     cond_resched();
-
-     if (order != 0)
-         drain_all_local_pages();
-
-     if (likely(did_some_progress)) {
-         /* Reclaim reported progress: retry with the same alloc_flags. */
-         page = get_page_from_freelist(gfp_mask, order,
-                         zonelist, alloc_flags);
-         if (page)
-             goto got_pg;
-     } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-         /* No progress, VFS calls allowed, retries allowed: OOM-killer path. */
-         if (!try_set_zone_oom(zonelist)) {
-             schedule_timeout_uninterruptible(1);
-             goto restart;
-         }
-
-         /*
-          * Go through the zonelist yet one more time, keep
-          * very high watermark here, this is only to catch
-          * a parallel oom killing, we must fail if we're still
-          * under heavy pressure.
-          */
-         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-         if (page) {
-             clear_zonelist_oom(zonelist);
-             goto got_pg;
-         }
-
-         /* The OOM killer will not help higher order allocs so fail */
-         if (order > PAGE_ALLOC_COSTLY_ORDER) {
-             clear_zonelist_oom(zonelist);
-             goto nopage;
-         }
-
-         out_of_memory(zonelist, gfp_mask, order);
-         clear_zonelist_oom(zonelist);
-         goto restart;
-     }
-
-     /*
-      * Don't let big-order allocations loop unless the caller explicitly
-      * requests that. Wait for some write requests to complete then retry.
-      *
-      * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
-      * <= 3, but that may not be true in other implementations.
-      */
-     do_retry = 0;
-     if (!(gfp_mask & __GFP_NORETRY)) {
-         if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
-                         (gfp_mask & __GFP_REPEAT))
-             do_retry = 1;
-         if (gfp_mask & __GFP_NOFAIL)
-             do_retry = 1;
-     }
-     if (do_retry) {
-         congestion_wait(WRITE, HZ/50);
-         goto rebalance;
-     }
-
- nopage:
-     /* Every jump to this label happens with page == NULL. */
-     if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-         printk(KERN_WARNING "%s: page allocation failure."
-             " order:%d, mode:0x%x\n",
-             p->comm, order, gfp_mask);
-         dump_stack();
-         show_mem();
-     }
- got_pg:
-     return page;
- }
- <pre></pre>
- <p><span style="font-size:18px">get_page_from_freelist源代码的详细分析如下:</span></p>
- <p></p>
- <pre name="code" class="cpp">static struct page *
- get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, int alloc_flags)
- { /*
-    * Walk zonelist and return pages from the first zone that passes the
-    * cpuset and watermark checks and from which buffered_rmqueue()
-    * succeeds; returns NULL if no zone delivers.
-    *
-    * Role of zlc_active, as far as this code shows: it is switched on
-    * after the first zone has been tried (try_next_zone). While it is on,
-    * zones that zlc_zone_worth_trying() rejects are skipped without being
-    * examined. If such a scan ends without a page, zlc_active is cleared
-    * and the list is scanned once more with no zone skipped.
-    */
- struct zone **z;//cursor into zonelist->zones
- struct page *page = NULL;
- int classzone_idx = zone_idx(zonelist->zones[0]);//index of the first zone in the list; zone_idx(zone) is ((zone) - (zone)->zone_pgdat->node_zones)
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
-
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- z = zonelist->zones;//start at the first zone
- //walk the zones in list order until one of them yields a page
- do {
- /*
- * In NUMA, this could be a policy zonelist which contains
- * zones that may not be allowed by the current gfp_mask.
- * Check the zone is allowed by the current flags
- */
- if (unlikely(alloc_should_filter_zonelist(zonelist))) {//this zonelist needs filtering against gfp_mask (see the comment above)
- if (highest_zoneidx == -1)
- highest_zoneidx = gfp_zone(gfp_mask);//computed lazily, once: the zone type gfp_mask selects
- if (zone_idx(*z) > highest_zoneidx)//skip zones whose index lies above it
- continue;
- }
-
- if (NUMA_BUILD && zlc_active &&//while the zonelist cache is active ...
- !zlc_zone_worth_trying(zonelist, z, allowednodes))//... skip zones it reports as not worth trying
- continue;
- zone = *z;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))//the cpuset check rejects this zone for the request
- goto try_next_zone;
-
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {//watermarks apply to this request
- unsigned long mark;//watermark selected by alloc_flags
- if (alloc_flags & ALLOC_WMARK_MIN)
- mark = zone->pages_min;
- else if (alloc_flags & ALLOC_WMARK_LOW)
- mark = zone->pages_low;
- else
- mark = zone->pages_high;
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {//the zone is below the watermark for this request
- if (!zone_reclaim_mode ||//give up on the zone unless zone reclaim is enabled and zone_reclaim() returns non-zero
- !zone_reclaim(zone, gfp_mask, order))
- goto this_zone_full;
- }
- }
-
- page = buffered_rmqueue(zonelist, zone, order, gfp_mask);//take the pages from this zone
- if (page)//success: stop scanning
- break;
- this_zone_full:
- if (NUMA_BUILD)
- zlc_mark_zone_full(zonelist, z);//record the zone as full in the zonelist cache
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup) {//one-time setup that also switches the zonelist cache on
- /* we do zlc_setup after the first zone is tried */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- } while (*(++z) != NULL);
-
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {//nothing found while zones may have been skipped: rescan once with the cache switched off
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;
- }
- </pre>
- <p><span style="font-size:18px; color:#ff0000">关于上面一段代码中zlc_active的作用不明白,还望理解的人指点一下。</span></p>
- <p></p>
- <pre name="code" class="cpp">struct zonelist { /* list of zones that get_page_from_freelist() tries in order */
- struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
- struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited
- #ifdef CONFIG_NUMA
- struct zonelist_cache zlcache; // only present on CONFIG_NUMA builds
- #endif
- };</pre><br>
- <pre name="code" class="cpp">struct zonelist_cache { /* per-zonelist hints; both arrays are sized MAX_ZONES_PER_ZONELIST */
- unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid: node id for each zone slot */
- DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? one bit per zone slot */
- unsigned long last_full_zap; /* when last zap'd (jiffies) */
- };</pre><br>
- <span style="font-size:18px">zone_watermark_ok源代码详细分析如下:</span><br>
- <p></p>
- <p></p>
- <pre name="code" class="cpp">int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
- { /* Returns 1 if zone z can serve a 2^order request while staying above watermark mark (relaxed by ALLOC_HIGH / ALLOC_HARDER), 0 otherwise. */
- /* free_pages may go negative - that's OK */
- long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;//the zone's NR_FREE_PAGES counter, less the (1 << order) pages of this request, plus one
- int o;
-
- if (alloc_flags & ALLOC_HIGH)//ALLOC_HIGH cuts the threshold in half
- min -= min / 2;
- if (alloc_flags & ALLOC_HARDER)//ALLOC_HARDER removes a further quarter of the current threshold
- min -= min / 4;
-
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])//fail unless the free pages exceed the threshold plus the lowmem reserve for classzone_idx
- return 0;
- for (o = 0; o < order; o++) {//then check every order below the requested one
- /* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;//discount the pages held in free blocks of order o
-
- /* Require fewer higher order pages to be free */
- min >>= 1;// the threshold is halved for each order step
-
- if (free_pages <= min)//too few pages remain in blocks above order o
- return 0;
- }
- return 1;
- }
- </pre><span style="font-size:18px">buffered_rmqueue源代码详细分析如下:</span>
- <p></p>
- <p></p>
- <pre name="code" class="cpp">static struct page *buffered_rmqueue(struct zonelist *zonelist,
- struct zone *zone, int order, gfp_t gfp_flags)
- { /* Take 2^order pages from zone: order-0 requests come from the per-CPU list pcp[cold], larger ones from __rmqueue() under zone->lock. Returns NULL on failure. */
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);//cold is used as an index into pcp[] below; the masked value itself would be the __GFP_COLD bit, so !! collapses it to exactly 0 or 1
- int cpu;
- int migratetype = allocflags_to_migratetype(gfp_flags);//migrate type derived from gfp_flags
-
- again:
- cpu = get_cpu();//current CPU id; paired with put_cpu() on both exit paths
- if (likely(order == 0)) {//single page: served from the per-CPU page list
- struct per_cpu_pages *pcp;
-
- pcp = &zone_pcp(zone, cpu)->pcp[cold];//this CPU's list for the requested hot/cold kind
- local_irq_save(flags);//the per-CPU list is only touched with local interrupts off; NOTE(review): presumably because interrupt handlers also allocate single pages - confirm
- if (!pcp->count) {//list is empty: refill it
- pcp->count = rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);//rmqueue_bulk() is asked for pcp->batch order-0 pages; its return value becomes the new count
- if (unlikely(!pcp->count))//still empty after the refill: give up
- goto failed;
- }
-
- /* Find a page of the appropriate migrate type */
- list_for_each_entry(page, &pcp->list, lru)//look for a page whose page_private() equals the wanted migrate type
- if (page_private(page) == migratetype)
- break;
-
- /* Allocate more to the pcp list if necessary */
- if (unlikely(&page->lru == &pcp->list)) { /* the loop ran off the end: no page of that type was found */
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);
- page = list_entry(pcp->list.next, struct page, lru); /* take the first page on the list after the refill */
- }
-
- list_del(&page->lru);//unlink the page from the per-CPU list; the count is decremented on the next line
- pcp->count--;
- } else {
- spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
- spin_unlock(&zone->lock); /* only the lock is dropped here; the saved flags are restored by local_irq_restore() below */
- if (!page)
- goto failed;
- }
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(zonelist, zone);
- local_irq_restore(flags);
- put_cpu();
-
- VM_BUG_ON(bad_range(zone, page));
- if (prep_new_page(page, order, gfp_flags)) /* non-zero return: start over with another page */
- goto again;
- return page;
-
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }</pre><span style="color:rgb(255,0,0); font-family:Arial; font-size:18px; line-height:26px"> 我也知道有不少的细节都没有分析到位,可是我也没有办法,曾经想着把里面涉及到的每个函数都分析到位,可是那样的话本身至关的痛苦,由于那样的结果就是 不少天都没有办法前进一点,会让人至关的有挫败感,最后只能选择大概先都过一遍,由于本身是一个内核的初学者,而内核先后的关联又很大,也只能先过一遍, 到后面我会从新回来看我写得博客,能增进一些分析就增进一些分析。若是您认为上面确实有很重要的地方我没有分析到,但愿您指 点。</span><br>
- <br>
- <br>
- <p></p>
- <br>
- <p></p>
- <p><br>
- </p>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <p></p>