在虚拟内存一节中,我提到了 Linux 虚拟内存区域的结构,但它具体是如何在 Linux 中表示与实现的呢?
我利用 Linux 2.6 的源码进行了浅显的分析。
在 Linux 中,进程控制块(PCB)的结构为 task_struct。我们以 Linux 2.6 为例,其源码如下:
struct task_struct { //表示进程当前运行状态 //volatile避免了读取到缓存在寄存器中的脏数据,而是直接从内存中取 //能够看到state基本有三种,但大于0还分为不少不一样的状态 //#define TASK_RUNNING 0 //#define TASK_INTERRUPTIBLE 1 //#define TASK_UNINTERRUPTIBLE 2 //#define TASK_STOPPED 4 //#define TASK_TRACED 8 //#define EXIT_ZOMBIE 16 //#define EXIT_DEAD 32 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ //这个结构体保存了进程描述符中中频繁访问和须要快速访问的字段,内核依赖于该数据结构来得到当前进程的描述符。经过源码能够看到,该struct内拥有指向task_struct的指针 struct thread_info *thread_info; atomic_t usage; //进程标志,描述进程当前的状态(不是运行状态),如:PF_SUPERPRIV表示进程拥有超级用户特权。。 unsigned long flags; /* per process flags, defined below */ //系统调用跟踪 unsigned long ptrace; //内核锁标志(判断是否被上锁) int lock_depth; /* Lock depth */ //进程优先级 int prio, static_prio; struct list_head run_list; //进程调度队列 prio_array_t *array; //进程平均等待时间 unsigned long sleep_avg; //timestamp:进程最近插入运行队列的时间或最近一次进程切换的时间 //last_ran:最近一次替换本进程的进程切换时间 unsigned long long timestamp, last_ran; //进程被唤醒时所使用的条件代码,就是从什么状态被唤醒的。 int activated; //进程的调度类型 unsigned long policy; cpumask_t cpus_allowed; //time_slice:进程的剩余时间片 //first_time_slice:建立后首次获取的时间片,为1表示当前的时间片是从父进程分来的 unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; #endif struct list_head tasks; /* * ptrace_list/ptrace_children forms the list of my children * that were stolen by a ptracer. */ struct list_head ptrace_children; struct list_head ptrace_list; //mm:内存描述符,其下有程地址空间下的虚拟内存信息 //actvie_mm:内核线程所借用的地址空间 struct mm_struct *mm,active_mm; /* task state */ struct linux_binfmt *binfmt; //进程的退出状态,大于0表示僵死 long exit_state; //exit_code:存放进程的退出码 //exit_signal:当进程退出时发给父进程的信号,若是是轻量级进程为-1 int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ /* ??? */ unsigned long personality; //记录是否执行execve系统调用 unsigned did_exec:1; //进程id pid_t pid; //所在线程组领头进程的PID pid_t tgid; /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. 
(p->father can be replaced with * p->parent->pid) */ // struct task_struct *real_parent; /* real parent process (when being debugged) */ //指向P的当前父进程 struct task_struct *parent; /* parent process */ /* * children/sibling forms the list of my children plus the * tasks I'm ptracing. */ //链表的头部,链表中全部元素都是P建立的子进程 struct list_head children; /* list of my children */ //兄弟进程之间相链接的链表 struct list_head sibling; /* linkage in my parent's children list */ //P所在进程组的领头进程 struct task_struct *group_leader; /* threadgroup leader */ //每一个进程有四个PID,把这四个PID挂到PID HASH表里的不一样位置,这样从PID到task就很快了 /* PID/PID hash table linkage. */ struct pid pids[PIDTYPE_MAX]; //为vfork()用来等待子进程的队列 struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ //进程的实时优先级 unsigned long rt_priority; //如下为一些时间与定时信息 unsigned long it_real_value, it_real_incr; cputime_t it_virt_value, it_virt_incr; cputime_t it_prof_value, it_prof_incr; struct timer_list real_timer; cputime_t utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted; unsigned keep_capabilities:1; struct user_struct *user; #ifdef CONFIG_KEYS struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process (CLONE_THREAD) */ struct key *thread_keyring; /* keyring private to this thread */ #endif int oomkilladj; /* OOM kill score adjustment (bit shift). 
*/ char comm[TASK_COMM_LEN]; /* file system info */ int link_count, total_link_count; /* ipc stuff */ struct sysv_sem sysvsem; /* CPU-specific state of this task */ struct thread_struct thread; /* filesystem information */ //进程的可执行映象所在的文件系统 struct fs_struct *fs; /* open file information */ //进程打开的文件 struct files_struct *files; /* namespace */ struct namespace *namespace; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; sigset_t blocked, real_blocked; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; void *security; struct audit_context *audit_context; /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; /* context-switch lock */ spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; /* VM state */ struct reclaim_state *reclaim_state; struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; struct io_context *io_context; unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ /* * current io wait handle: wait queue entry to use for io waits * If this thread is processing aio, this points at the waitqueue * inside the currently handled kiocb. It may be NULL (i.e. default * to a stack based synchronous wait) if its doing sync IO. */ wait_queue_t *io_wait; /* i/o counters(bytes read/written, #syscalls */ u64 rchar, wchar, syscr, syscw; #if defined(CONFIG_BSD_PROCESS_ACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ clock_t acct_stimexpd; /* clock_t-converted stime since last update */ #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; #endif };
可以看到,task_struct 具有很多字段,包含了进程状态、内存、调度、文件系统、时间分配等各种信息,上面我也只是给出了部分注释。下面抓住重点——内存描述符 mm_struct,我们进一步查看其源码:
/*
 * Memory descriptor, Linux 2.6: describes a process's entire address
 * space.  task_struct->mm points at one of these.
 */
struct mm_struct {
	/* head of the linked list of memory regions (VMAs) */
	struct vm_area_struct * mmap;		/* list of VMAs */
	/* red-black tree of the same memory regions, for fast lookup */
	struct rb_root mm_rb;
	/* the most recently referenced memory region */
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
	/* method used to search the address space for a free linear
	 * address interval */
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
	/* method invoked when a memory region is released */
	void (*unmap_area) (struct vm_area_struct *area);
	/* linear address of the first anonymous region or file memory
	 * mapping allocated */
	unsigned long mmap_base;		/* base of mmap area */
	/* address the kernel starts searching from for a free linear
	 * address interval in this address space */
	unsigned long free_area_cache;		/* first hole */
	/* pointer to the page global directory (page tables) */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	/* spin lock protecting the memory regions and the page tables */
	spinlock_t page_table_lock;		/* Protects page tables, mm->rss, mm->anon_rss */

	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock */

	/* start/end addresses of the code, data, heap, stack and
	 * argument/environment segments */
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;
	unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;

	unsigned long saved_auxv[42];		/* for /proc/PID/auxv */

	unsigned dumpable:1;
	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Token based thrashing protection. */
	unsigned long swap_token_time;
	char recent_pagein;

	/* coredumping support */
	int core_waiters;
	struct completion *core_startup_done, core_done;

	/* aio bits */
	rwlock_t ioctx_list_lock;
	struct kioctx *ioctx_list;
	struct kioctx default_kioctx;

	unsigned long hiwater_rss;		/* High-water RSS usage */
	unsigned long hiwater_vm;		/* High-water virtual memory usage */
};
其中这里要讲到的字段是 mmap,它指向线性区对象链表的表头;而 pgd 则指向该进程的页全局目录(页表)。
在地址空间中,mmap 是由内存区域(用 vm_area_struct 结构来表示)组成的链表,mm_rb 则以红黑树的形式组织同样的这些区域:链表便于顺序遍历所有区域,红黑树便于按地址快速查找。当虚拟区较少时,用由 mmap 指向的单链表即可;当虚拟区很多时,由 mm_rb 指向的红黑树结构可以在大量数据的情况下保持较高的查找效率。所有的 mm_struct 结构体通过自身的 mmlist 域链接在一个双向链表上,该链表的首元素是 init_mm 内存描述符,代表 init 进程的地址空间。
对于 mmap 指向的 vm_area_struct,我们继续深入源码:
/*
 * Virtual memory area (VMA), Linux 2.6: one contiguous region of a
 * process address space with uniform access attributes.
 * NOTE: excerpt — the definition is truncated here ("..."), as in the
 * original article.
 */
struct vm_area_struct {
	/* the mm_struct (address space) this region belongs to */
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	/* first address of this virtual memory region */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	/* address one past the end of this virtual memory region */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* next member of the per-task VMA list */
	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	/* VMA flag bits */
	unsigned long vm_flags;		/* Flags, listed below. */

	/* node that links this VMA into the mm's red-black tree */
	struct rb_node vm_rb;
	...
}
至此,我们可以看出,进程的虚拟内存空间即是由一个个 vm_area_struct 结构体通过链表组织起来的空间。
所以,结合个人理解及对计算机原理的认识,我尝试画出了 Linux 中虚拟内存的结构示意图,如下:
但实际上,并非每个 vm_area_struct 都恰好对应内存中的一个段(每个 segment 可能由多个 VMA 组成)。vm_area_struct 描述的是一段连续的、具有相同访问属性的虚存空间,其大小为物理内存页面的整数倍。
以上的示意图及分析是结合自身理解所谈,有班门弄斧之嫌,如有不足之处还望指出。
同时,近期学得越多,越发现自己不会的也越多。无论是在计算机系统上,还是在 Linux 源码上,有太多的知识点我不曾涉猎,还是需要多学习、多总结、多思考。