源码之linux进程:vm_area_struct与虚拟内存的关系

前言

在之前关于虚拟内存的文章中,我提到了 Linux 虚拟内存区域的结构,但它具体是如何在 Linux 中表示与实现的呢?

我借助 Linux 2.6 的源码对此做了浅显的分析。

正文

task_struct

在 Linux 中,进程控制块(PCB)对应的结构为 task_struct。我们以 Linux 2.6 为例,其源码如下:

/*
 * Process descriptor (the process control block) as excerpted from the
 * Linux 2.6 sources.  It gathers process state, scheduling, memory,
 * file-system, signal and timing information; only part of the members
 * carry explanatory comments here.
 */
struct task_struct {
	/*
	 * Current run state of the process.
	 * volatile forces every access to go to memory instead of reusing a
	 * value cached in a register.
	 * There are three broad cases (see the trailing comment), but the
	 * values greater than 0 are split into several distinct states:
	 *   TASK_RUNNING          0
	 *   TASK_INTERRUPTIBLE    1
	 *   TASK_UNINTERRUPTIBLE  2
	 *   TASK_STOPPED          4
	 *   TASK_TRACED           8
	 *   EXIT_ZOMBIE          16
	 *   EXIT_DEAD            32
	 */
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	/*
	 * Holds the descriptor fields that are accessed frequently or must be
	 * reached quickly; the kernel relies on it to obtain the descriptor of
	 * the current process (it contains a pointer back to task_struct).
	 */
	struct thread_info *thread_info;

	atomic_t usage;
	/* Process flags describing its status (not the run state), e.g. PF_SUPERPRIV (super-user privileges). */
	unsigned long flags;	/* per process flags, defined below */
	/* Tracing state used by ptrace */
	unsigned long ptrace;
	/* Kernel lock marker (tells whether the lock is held) */
	int lock_depth;		/* Lock depth */
	/* Process priorities */
	int prio, static_prio;
	struct list_head run_list;
	/* Scheduler queue the process belongs to */
	prio_array_t *array;
	/* Average time the process spends sleeping/waiting */
	unsigned long sleep_avg;
	/*
	 * timestamp: time the process was last inserted into the run queue, or
	 *            of the most recent process switch involving it
	 * last_ran:  time of the most recent switch that replaced this process
	 */
	unsigned long long timestamp, last_ran;
	/* Condition code used when the process is woken up, i.e. which state it was woken from */
	int activated;
	/* Scheduling policy of the process */
	unsigned long policy;
	cpumask_t cpus_allowed;
	/*
	 * time_slice:       remaining time slice of the process
	 * first_time_slice: 1 means the current slice is the first one after
	 *                   creation and was handed down from the parent
	 */
	unsigned int time_slice, first_time_slice;

#ifdef CONFIG_SCHEDSTATS
	struct sched_info sched_info;
#endif

	struct list_head tasks;
	/*
	 * ptrace_list/ptrace_children forms the list of my children
	 * that were stolen by a ptracer.
	 */
	struct list_head ptrace_children;
	struct list_head ptrace_list;
	/*
	 * mm:        memory descriptor holding the virtual-memory information
	 *            of the process address space
	 * active_mm: address space borrowed by a kernel thread
	 * Both members are pointers; each declarator needs its own '*'.
	 */
	struct mm_struct *mm, *active_mm;

/* task state */
	struct linux_binfmt *binfmt;
	/* Exit state of the process (e.g. EXIT_ZOMBIE, EXIT_DEAD) */
	long exit_state;
	/*
	 * exit_code:   exit code of the process
	 * exit_signal: signal sent to the parent when the process exits;
	 *              -1 for a lightweight process
	 */
	int exit_code, exit_signal;
	int pdeath_signal;  /* The signal sent when the parent dies */
	/* ??? */
	unsigned long personality;
	/* Records whether the execve system call has been executed */
	unsigned did_exec:1;
	/* Process ID */
	pid_t pid;
	/* PID of the leader of the thread group this task belongs to */
	pid_t tgid;
	/*
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
	/* Current parent of the process */
	struct task_struct *parent;	/* parent process */
	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
	/* List head; every element of the list is a child created by this process */
	struct list_head children;	/* list of my children */
	/* List that links sibling processes together */
	struct list_head sibling;	/* linkage in my parent's children list */
	/* Leader of the thread group this task belongs to */
	struct task_struct *group_leader;	/* threadgroup leader */
	/*
	 * Each process has four kinds of PID; hooking them into different
	 * positions of the PID hash table makes the PID -> task lookup fast.
	 */
	/* PID/PID hash table linkage. */
	struct pid pids[PIDTYPE_MAX];
	/* Completion that vfork() uses to wait for the child */
	struct completion *vfork_done;		/* for vfork() */
	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
	/* Real-time priority of the process */
	unsigned long rt_priority;
	/* Timing and timer information follows */
	unsigned long it_real_value, it_real_incr;
	cputime_t it_virt_value, it_virt_incr;
	cputime_t it_prof_value, it_prof_incr;
	struct timer_list real_timer;
	cputime_t utime, stime;
	unsigned long nvcsw, nivcsw; /* context switch counts */
	struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
/* process credentials */
	uid_t uid,euid,suid,fsuid;
	gid_t gid,egid,sgid,fsgid;
	struct group_info *group_info;
	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
	unsigned keep_capabilities:1;
	struct user_struct *user;
#ifdef CONFIG_KEYS
	struct key *session_keyring;	/* keyring inherited over fork */
	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
	struct key *thread_keyring;	/* keyring private to this thread */
#endif
	int oomkilladj; /* OOM kill score adjustment (bit shift). */
	char comm[TASK_COMM_LEN];
/* file system info */
	int link_count, total_link_count;
/* ipc stuff */
	struct sysv_sem sysvsem;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	/* Filesystem context of the process (struct fs_struct is defined elsewhere) */
	struct fs_struct *fs;
/* open file information */
	/* Files opened by the process */
	struct files_struct *files;
/* namespace */
	struct namespace *namespace;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;

	sigset_t blocked, real_blocked;
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;

	void *security;
	struct audit_context *audit_context;

/* Thread group tracking */
	u32 parent_exec_id;
	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
	spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
	spinlock_t proc_lock;
/* context-switch lock */
	spinlock_t switch_lock;

/* journalling filesystem info */
	void *journal_info;

/* VM state */
	struct reclaim_state *reclaim_state;

	struct dentry *proc_dentry;
	struct backing_dev_info *backing_dev_info;

	struct io_context *io_context;

	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use. */
/*
 * current io wait handle: wait queue entry to use for io waits
 * If this thread is processing aio, this points at the waitqueue
 * inside the currently handled kiocb. It may be NULL (i.e. default
 * to a stack based synchronous wait) if its doing sync IO.
 */
	wait_queue_t *io_wait;
/* i/o counters(bytes read/written, #syscalls */
	u64 rchar, wchar, syscr, syscw;
#if defined(CONFIG_BSD_PROCESS_ACCT)
	u64 acct_rss_mem1;	/* accumulated rss usage */
	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *mempolicy;
	short il_next;
#endif
};

mm_struct

可以看到,task_struct 具有很多字段,包含了进程状态、内存、调度、文件系统、时间分配等各类信息,上面我也只给出了部分注释。下面抓住重点——虚拟内存描述符 mm_struct,我们进一步查看其源码:

/*
 * Memory descriptor: the structure that task_struct's mm member points to.
 * It describes a process address space through its list/tree of memory
 * regions (VMAs), its page table pointer and per-address-space counters.
 */
struct mm_struct {
    /* Head of the linked list of memory-region (VMA) objects */
	struct vm_area_struct * mmap;		/* list of VMAs */
    /* Root of the red-black tree of memory-region objects */
	struct rb_root mm_rb;
    /* Most recently referenced memory-region object */
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
    /* Method that searches the process address space for an available linear address interval */
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
    /* Method invoked when a memory region is released */
	void (*unmap_area) (struct vm_area_struct *area);
    /* Linear address of the first allocated anonymous region or file memory mapping */
	unsigned long mmap_base;		/* base of mmap area */
    /* Address from which the kernel starts searching the process address space for a free linear address interval */
	unsigned long free_area_cache;		/* first hole */
    /* Points to the page table */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	/* Spin lock for the memory regions and for the page tables */
    spinlock_t page_table_lock;		/* Protects page tables, mm->rss, mm->anon_rss */

	struct list_head mmlist;		/* List of maybe swapped mm's. These are globally strung * together off init_mm.mmlist, and are protected * by mmlist_lock */

    /* Start and end addresses of the individual segments */
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;
	unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;

	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */

	unsigned dumpable:1;
	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Token based thrashing protection. */
	unsigned long swap_token_time;
	char recent_pagein;

	/* coredumping support */
	int core_waiters;
	struct completion *core_startup_done, core_done;

	/* aio bits */
	rwlock_t		ioctx_list_lock;
	struct kioctx		*ioctx_list;

	struct kioctx		default_kioctx;

	unsigned long hiwater_rss;	/* High-water RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */
};

其中这里要重点讲的字段是 mmap,它指向线性区对象的链表头;而 pgd 则指向该进程的页表。

在地址空间中,mmap 把该地址空间的内存区域(用 vm_area_struct 结构表示)组织成一条链表,mm_rb 则把同一批内存区域组织成一棵红黑树。需要注意的是,两者并不是"区域少时用链表、区域多时用红黑树"的二选一关系:每个 vm_area_struct 同时通过 vm_next 挂在链表上、通过 vm_rb 挂在红黑树上。链表便于按地址顺序遍历全部区域,红黑树则便于查找,在区域数量很多时依然能高效地定位某个地址所在的区域。所有的 mm_struct 结构体通过自身的 mmlist 域链接在一个双向链表上,该链表的首元素是 init_mm 内存描述符,代表 init 进程的地址空间。

vm_area_struct

对于 mmap 指向的 vm_area_struct,我们继续深入源码:

/*
 * Descriptor of one virtual memory area (VMA) inside an address space.
 * This is an excerpt: only the leading members of the structure are shown.
 */
struct vm_area_struct {
	/* The mm_struct (address space) that owns this region */
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	/* First address of this virtual memory region */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	/* End of this virtual memory region (exclusive, see the trailing comment) */
	unsigned long vm_end;		/* The first byte after our end address within vm_mm. */

	/* Next member of the VMA list */
	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	/* Holds the VMA flag bits */
	unsigned long vm_flags;		/* Flags, listed below. */
	/* Node through which this VMA is inserted into the red-black tree */
	struct rb_node vm_rb;

	/* ... remaining members omitted from this excerpt ... */
};

至此,我们可以看出,虚拟内存就是由一个个 vm_area_struct 结构体通过链表组织起来的空间。

因此,结合个人的理解以及对计算机原理的认识,我尝试画出了 Linux 中虚拟内存的结构示意图,如下:
在这里插入图片描述
但实际上,vm_area_struct 与内存中的段并非一一对应(一个 segment 可能由多个 VMA 组成)。vm_area_struct 描述的是一段连续的、具有相同访问属性的虚存空间,该虚存空间的大小为物理内存页面大小的整数倍。

杂谈

以上的示意图及分析是结合自身的理解所谈,有些班门弄斧的味道,如有不足之处还望指出。

同时,近期学得越多,发现自己不会的也越多。无论是在计算机系统上,还是在 Linux 源码上,都有太多的知识点我未曾涉猎,还是需要多学习、多总结、多思考。