CPU MEMORY BARRIERS
-------------------

The Linux kernel has eight basic CPU memory barriers:

    TYPE             MANDATORY                SMP CONDITIONAL
    ===============  =======================  ========================
    GENERAL          mb()                     smp_mb()
    WRITE            wmb()                    smp_wmb()
    READ             rmb()                    smp_rmb()
    DATA DEPENDENCY  read_barrier_depends()   smp_read_barrier_depends()
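As a quick, hedged illustration of how the SMP conditional pair is used (the variables and functions below are invented for this sketch, not taken from the kernel source): a producer publishes a payload and then a flag, and the consumer reads them in the opposite order.

#include <linux/kernel.h>
#include <asm/barrier.h>

/* Hypothetical example: publish 'msg' before 'data_ready', consume in reverse. */
static int msg;
static int data_ready;

static void producer(void)
{
    msg = 42;
    smp_wmb();          /* order the payload store before the flag store */
    data_ready = 1;
}

static void consumer(void)
{
    if (data_ready) {
        smp_rmb();      /* pairs with the smp_wmb() in producer() */
        pr_info("msg=%d\n", msg);
    }
}

Real code of this era would additionally wrap the flag accesses in ACCESS_ONCE()/READ_ONCE() so the compiler cannot cache or tear them; the sketch keeps only the barrier pairing.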
struct mm_struct {
    struct vm_area_struct *mmap;        /* list of VMAs (linked list of virtual memory areas) */
    struct rb_root mm_rb;               /* red-black tree of VMAs */
    u32 vmacache_seqnum;                /* per-thread vmacache */
#ifdef CONFIG_MMU
    /* find an unmapped hole in the memory-mapping region */
    unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
    unsigned long mmap_base;            /* base of mmap area */
    unsigned long mmap_legacy_base;     /* base of mmap area in bottom-up allocations */
    unsigned long task_size;            /* size of task vm space (length of the user virtual address space) */
    unsigned long highest_vm_end;       /* highest vma end address */
    pgd_t *pgd;                         /* page global directory, i.e. the first-level page table */
    atomic_t mm_users;                  /* How many users with user space?
                                         * (processes sharing this user virtual address space) */
    atomic_t mm_count;                  /* How many references to "struct mm_struct"
                                         * (users count as 1) */
    atomic_long_t nr_ptes;              /* PTE page table pages */
    atomic_long_t nr_pmds;              /* PMD page table pages */
    int map_count;                      /* number of VMAs */

    spinlock_t page_table_lock;         /* Protects page tables and some counters */
    struct rw_semaphore mmap_sem;

    struct list_head mmlist;            /* List of maybe swapped mm's. These are globally strung
                                         * together off init_mm.mmlist, and are protected
                                         * by mmlist_lock */

    unsigned long saved_auxv[AT_VECTOR_SIZE];   /* for /proc/PID/auxv */

    /*
     * Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    struct mm_rss_stat rss_stat;

    struct linux_binfmt *binfmt;

    cpumask_var_t cpu_vm_mask_var;

    /* Architecture-specific MM context */
    mm_context_t context;               /* processor-architecture-specific memory management context */

    unsigned long flags;                /* Must use atomic bitops to access the bits */

    struct core_state *core_state;      /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t ioctx_lock;
    struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct __rcu *owner;
#endif

    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    pgtable_t pmd_huge_pte;             /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
    struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
    /*
     * numa_next_scan is the next time that the PTEs will be marked
     * pte_numa. NUMA hinting faults will gather statistics and migrate
     * pages to new nodes if necessary.
     */
    unsigned long numa_next_scan;

    /* Restart point for scanning and setting pte_numa */
    unsigned long numa_scan_offset;

    /* numa_scan_seq prevents two threads setting pte_numa */
    int numa_scan_seq;
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
    /*
     * An operation with batched TLB flushing is going on. Anything that
     * can move process memory needs to flush the TLB when moving a
     * PROT_NONE or PROT_NUMA mapped page.
     */
    bool tlb_flush_pending;
#endif
    struct uprobes_state uprobes_state;
#ifdef CONFIG_X86_INTEL_MPX
    /* address of the bounds directory */
    void __user *bd_addr;
#endif
};
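To see how these fields are typically consumed, here is a small hedged sketch (the helper name dump_vmas is invented) that takes mmap_sem for reading and walks the VMA list rooted at mm->mmap:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/rwsem.h>

/* Hypothetical helper: print every VMA of an mm.  Assumes the linked-list
 * layout shown above (mm->mmap / vma->vm_next). */
static void dump_vmas(struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    down_read(&mm->mmap_sem);           /* the VMA list/tree is protected by mmap_sem */
    for (vma = mm->mmap; vma; vma = vma->vm_next)
        pr_info("%#lx-%#lx flags=%#lx\n",
                vma->vm_start, vma->vm_end, vma->vm_flags);
    pr_info("map_count=%d\n", mm->map_count);
    up_read(&mm->mmap_sem);
}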
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */

    /* start address of the region, and the address of the first byte after its end */
    unsigned long vm_start;             /* Our start address within vm_mm. */
    unsigned long vm_end;               /* The first byte after our end address within vm_mm. */

    /* linked list of VM areas per task, sorted by address */
    struct vm_area_struct *vm_next, *vm_prev;   /* doubly linked list of VMAs */

    /* a plain list would make lookups slow, so each VMA is also a node in a red-black tree */
    struct rb_node vm_rb;

    /*
     * Largest free memory gap in bytes to the left of this VMA.
     * Either between this VMA and vma->vm_prev, or between one of the
     * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
     * get_unmapped_area find a free area of the right size.
     */
    unsigned long rb_subtree_gap;

    /* Second cache line starts here. */

    /* the user virtual address space this VMA belongs to */
    struct mm_struct *vm_mm;            /* The address space we belong to. */
    /* protection bits (access permissions) */
    pgprot_t vm_page_prot;              /* Access permissions of this VMA. */
    /* rwx and other flags */
    unsigned long vm_flags;             /* Flags, see mm.h. */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     * Supports asking which VMAs a given file range is mapped into.
     */
    struct {
        struct rb_node rb;
        unsigned long rb_subtree_last;
    } shared;

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages. A MAP_SHARED vma
     * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    /*
     * Strings together all anon_vma instances associated with this VMA;
     * a VMA may be linked both to its parent process's anon_vma and to its own.
     */
    struct list_head anon_vma_chain;    /* Serialized by mmap_sem &
                                         * page_table_lock */
    /* points to an anon_vma; struct anon_vma organizes all the virtual
     * address spaces an anonymous page is mapped into */
    struct anon_vma *anon_vma;          /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    /*
     * The virtual memory operations set:
     *
     * struct vm_operations_struct {
     *     void (*open)(struct vm_area_struct *area);
     *         // called when the VMA is created
     *     void (*close)(struct vm_area_struct *area);
     *         // called when the VMA is deleted
     *     int (*mremap)(struct vm_area_struct *area);
     *         // called when the VMA is moved with the mremap system call
     *     int (*fault)(struct vm_fault *vmf);
     *         // when a file-backed virtual page is accessed without a physical
     *         // page behind it, a page fault is raised and the handler calls
     *         // fault() to read the file data into the page cache
     *     int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
     *         // like fault(), but for file mappings that use transparent huge pages
     *     void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff);
     *         // on a read fault the handler not only reads the faulting file page
     *         // but also reads ahead the following pages, calling map_pages() to
     *         // allocate physical pages in the file's page cache
     *     int (*page_mkwrite)(struct vm_fault *vmf);
     *         // on the first write to a private file mapping, the fault handler
     *         // performs copy-on-write and calls page_mkwrite() to notify the
     *         // filesystem that the page is about to become writable, so it can
     *         // check whether the write is allowed or wait for the page to reach
     *         // a suitable state
     * };
     */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;             /* offset within the file, in units of pages */

    struct file *vm_file;               /* backing file; NULL for an anonymous mapping */
    void *vm_private_data;              /* private data of this memory region */
#ifndef CONFIG_MMU
    struct vm_region *vm_region;        /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
#endif
};
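These start/end semantics are what find_vma() relies on: it returns the first VMA whose vm_end lies above the given address, so the caller still has to check vm_start. A minimal hedged sketch (the helper addr_is_mapped is invented):

#include <linux/mm.h>
#include <linux/rwsem.h>

/* Hypothetical helper: check whether addr is covered by a VMA of mm. */
static bool addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
    struct vm_area_struct *vma;
    bool mapped = false;

    down_read(&mm->mmap_sem);
    vma = find_vma(mm, addr);               /* first VMA with vm_end > addr */
    if (vma && vma->vm_start <= addr)       /* addr may sit in a gap below vma */
        mapped = true;
    up_read(&mm->mmap_sem);

    return mapped;
}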
    error = mlock_future_check(mm, mm->def_flags, len);
    if (error)
        return error;

    /*
     * mm->mmap_sem is required to protect against another thread
     * changing the mappings in case we sleep.
     */
    verify_mm_writelocked(mm);

    /*
     * Clear old maps. this also does some error checking for us
     */
    /* walk the process's VMA rbtree to find the insertion point for addr */
 munmap_back:
    if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
        if (do_munmap(mm, addr, len))
            return -ENOMEM;
        goto munmap_back;
    }

    /* Check against address space limits *after* clearing old maps... */
    /* check whether the address space may grow by this much */
    if (!may_expand_vm(mm, len >> PAGE_SHIFT))
        return -ENOMEM;

    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;

    /* check whether the system has enough memory */
    if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
        return -ENOMEM;

    /* Can we just expand an old private anonymous mapping? */
    /* try to merge with an existing VMA */
    vma = vma_merge(mm, prev, addr, addr + len, flags,
                    NULL, NULL, pgoff, NULL);
    if (vma)
        goto out;       /* merged: nothing more to create */

    /*
     * create a vma struct for an anonymous mapping
     */
    /* no merge possible: allocate a new VMA covering [addr, addr + len) */
    vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
    if (!vma) {
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
    }
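For orientation, this fragment follows the anonymous-mapping path (it matches do_brk() in mm/mmap.c). After the allocation succeeds, the kernel fills in the new VMA and links it into both the list and the red-black tree, roughly as paraphrased below (a sketch, not copied verbatim):

    INIT_LIST_HEAD(&vma->anon_vma_chain);
    vma->vm_mm = mm;
    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_pgoff = pgoff;
    vma->vm_flags = flags;
    vma->vm_page_prot = vm_get_page_prot(flags);
    vma_link(mm, vma, prev, rb_link, rb_parent);    /* insert into mm->mmap and mm->mm_rb */
out:
    mm->total_vm += len >> PAGE_SHIFT;              /* account the new pages */
    if (flags & VM_LOCKED)
        mm->locked_vm += (len >> PAGE_SHIFT);
    return addr;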
//mmzone.h
typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];           /* array of memory zones in this node */
    struct zonelist node_zonelists[MAX_ZONELISTS];  /* fallback (backup) zone lists */
    int nr_zones;                                   /* number of zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP     /* all memory models except sparse memory */
    struct page *node_mem_map;      /* array of page descriptors, one per physical page */
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext; /* extended page attributes */
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
    struct bootmem_data *bdata;     /* bootmem boot-time allocator */
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
    /*
     * Must be held any time you expect node_start_pfn, node_present_pages
     * or node_spanned_pages stay constant. Holding this will also
     * guarantee that any pfn_valid() stays that way.
     *
     * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
     *
     * Nests above zone->lock and zone->span_seqlock
     */
    spinlock_t node_size_lock;
#endif
    unsigned long node_start_pfn;       /* first physical page frame number of this node */
    unsigned long node_present_pages;   /* total number of physical pages */
    unsigned long node_spanned_pages;   /* total size of the physical page range, including holes */
    int node_id;                        /* node identifier */
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd;         /* Protected by mem_hotplug_begin/end() */
    int kswapd_max_order;
    enum zone_type classzone_idx;
#ifdef CONFIG_NUMA_BALANCING
    /* Lock serializing the migrate rate limiting window */
    spinlock_t numabalancing_migrate_lock;

    /* Rate limiting time interval */
    unsigned long numabalancing_migrate_next_window;

    /* Number of pages migrated during the rate limiting time interval */
    unsigned long numabalancing_migrate_nr_pages;
#endif
} pg_data_t;
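A small hedged sketch (the helper dump_nodes is invented) of how pg_data_t is usually reached through NODE_DATA() and traversed zone by zone:

#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* Hypothetical helper: print basic information about every online node. */
static void dump_nodes(void)
{
    int nid;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);
        int i;

        pr_info("node %d: start_pfn=%lu present=%lu spanned=%lu\n",
                pgdat->node_id, pgdat->node_start_pfn,
                pgdat->node_present_pages, pgdat->node_spanned_pages);

        for (i = 0; i < pgdat->nr_zones; i++) {
            struct zone *z = &pgdat->node_zones[i];

            if (populated_zone(z))
                pr_info("  zone %s: present=%lu\n", z->name, z->present_pages);
        }
    }
}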
//mmzone.h
enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture     Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390                 <2G
     * arm                  Various
     * alpha                Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *                      <16M.
     */
    /*
     * DMA zone. DMA stands for Direct Memory Access. Devices that cannot
     * reach all of memory directly (e.g. legacy ISA devices) must be served
     * from this zone.
     */
    ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    /*
     * DMA32 zone. A 64-bit system that must support both devices that can only
     * address memory below 16MB and 32-bit devices that can only address memory
     * below 4GB needs this zone.
     */
    ZONE_DMA32,
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    /*
     * Normal zone: memory directly mapped into the kernel virtual address space,
     * also called the direct-mapped or linearly-mapped zone.
     */
    ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    /*
     * Highmem zone: a leftover from the 32-bit era. With the 3:1 user/kernel
     * split the kernel address space is only 1GB, so memory above roughly 1GB
     * cannot be mapped into the kernel address space permanently.
     */
    ZONE_HIGHMEM,
#endif
    /* Movable zone: a pseudo zone used to prevent memory fragmentation */
    ZONE_MOVABLE,
    __MAX_NR_ZONES
};
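Callers rarely name a zone directly; the GFP flags passed to the page allocator pick the highest zone that may satisfy the request (GFP_DMA restricts the allocation to ZONE_DMA, plain GFP_KERNEL stops at ZONE_NORMAL, __GFP_HIGHMEM allows ZONE_HIGHMEM). A hedged sketch with invented helper names:

#include <linux/gfp.h>
#include <linux/highmem.h>

/* Hypothetical helpers: allocate one page that may live in ZONE_HIGHMEM
 * (GFP_HIGHUSER includes __GFP_HIGHMEM) and map it temporarily. */
static void *grab_high_page(struct page **pagep)
{
    struct page *page = alloc_pages(GFP_HIGHUSER, 0);   /* order-0 page */

    if (!page)
        return NULL;
    *pagep = page;
    return kmap(page);      /* highmem pages need a temporary kernel mapping */
}

static void drop_high_page(struct page *page)
{
    kunmap(page);
    __free_pages(page, 0);
}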
struct zone {
    /* Read-mostly fields */

    /* zone watermarks, access with *_wmark_pages(zone) macros */
    unsigned long watermark[NR_WMARK];      /* watermarks used by the page allocator */

    /*
     * We don't know if the memory that we're going to allocate will be freeable
     * or/and it will be released eventually, so to avoid totally wasting several
     * GB of ram we must reserve some of the lower zone memory (otherwise we risk
     * to run OOM on the lower zones despite there's tons of freeable ram
     * on the higher zones). This array is recalculated at runtime if the
     * sysctl_lowmem_reserve_ratio sysctl changes.
     */
    /* used by the page allocator: how many pages this zone keeps in reserve
     * and will not lend to higher zone types */
    long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
    int node;
#endif

    /*
     * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     * this zone's LRU. Maintained by the pageout code.
     */
    unsigned int inactive_ratio;

    /*
     * This is a per-zone reserve of pages that should not be
     * considered dirtyable memory.
     */
    unsigned long dirty_balance_reserve;

#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_NUMA
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long min_unmapped_pages;
    unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
    /*
     * spanned_pages is the total pages spanned by the zone, including
     * holes, which is calculated as:
     *     spanned_pages = zone_end_pfn - zone_start_pfn;
     *
     * present_pages is physical pages existing within the zone, which
     * is calculated as:
     *     present_pages = spanned_pages - absent_pages(pages in holes);
     *
     * managed_pages is present pages managed by the buddy system, which
     * is calculated as (reserved_pages includes pages allocated by the
     * bootmem allocator):
     *     managed_pages = present_pages - reserved_pages;
     *
     * So present_pages may be used by memory hotplug or memory power
     * management logic to figure out unmanaged pages by checking
     * (present_pages - managed_pages). And managed_pages should be used
     * by page allocator and vm scanner to calculate all kinds of watermarks
     * and thresholds.
     *
     * Locking rules:
     *
     * zone_start_pfn and spanned_pages are protected by span_seqlock.
     * It is a seqlock because it has to be read outside of zone->lock,
     * and it is done in the main allocator path. But, it is written
     * quite infrequently.
     *
     * The span_seq lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock. It's good to
     * give them a chance of being in the same cacheline.
     *
     * Write access to present_pages at runtime should be protected by
     * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
     * present_pages should get_online_mems() to get a stable value.
     *
     * Read access to managed_pages should be safe because it's unsigned
     * long. Write access to zone->managed_pages and totalram_pages are
     * protected by managed_page_count_lock at runtime. Ideally only
     * adjust_managed_page_count() should be used instead of directly
     * touching zone->managed_pages and totalram_pages.
     */
    unsigned long managed_pages;    /* number of physical pages managed by the buddy allocator */
    unsigned long spanned_pages;    /* total pages spanned by this zone, including holes */
    unsigned long present_pages;    /* physical pages present in this zone, excluding holes */

    const char *name;               /* zone name */
    /*
     * Number of MIGRATE_RESERVE page block. To maintain for just
     * optimization. Protected by zone->lock.
     */
    int nr_migrate_reserve_block;

#ifdef CONFIG_MEMORY_ISOLATION
    /*
     * Number of isolated pageblock. It is used to solve incorrect
     * freepage counting problem due to racy retrieving migratetype
     * of pageblock. Protected by zone->lock.
     */
    unsigned long nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t span_seqlock;
#endif

    /*
     * wait_table                  -- the array holding the hash table
     * wait_table_hash_nr_entries  -- the size of the hash table array
     * wait_table_bits             -- wait_table_size == (1 << wait_table_bits)
     *
     * The purpose of all these is to keep track of the people
     * waiting for a page to become available and make them
     * runnable again when possible. The trouble is that this
     * consumes a lot of space, especially when so few things
     * wait on pages at a given time. So instead of using
     * per-page waitqueues, we use a waitqueue hash table.
     *
     * The bucket discipline is to sleep on the same queue when
     * colliding and wake all in that wait queue when removing.
     * When something wakes, it must check to be sure its page is
     * truly available, a la thundering herd. The cost of a
     * collision is great, but given the expected load of the
     * table, they should be so rare as to be outweighed by the
     * benefits from the saved space.
     *
     * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     * primary users of these fields, and in mm/page_alloc.c
     * free_area_init_core() performs the initialization of them.
     */
    wait_queue_head_t *wait_table;
    unsigned long wait_table_hash_nr_entries;
    unsigned long wait_table_bits;
    ZONE_PADDING(_pad1_)

    /* free areas of different sizes (one free list per allocation order) */
    struct free_area free_area[MAX_ORDER];
    /* zone flags, see below */
    unsigned long flags;

    /* Write-intensive fields used from the page allocator */
    spinlock_t lock;
ZONE_PADDING(_pad2_)
/* Write-intensive fields used by page reclaim */
    /* Fields commonly accessed by the page reclaim scanner */
    spinlock_t lru_lock;
    struct lruvec lruvec;

    /* Evictions & activations on the inactive file list */
    atomic_long_t inactive_age;

    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* pfn where compaction free scanner should start */
    unsigned long compact_cached_free_pfn;
    /* pfn where async and sync compaction migration scanner should start */
    unsigned long compact_cached_migrate_pfn[2];
#endif

#ifdef CONFIG_COMPACTION
    /*
     * On compaction failure, 1<<compact_defer_shift compactions
     * are skipped before trying again. The number attempted since
     * last failure is tracked with compact_considered.
     */
    unsigned int compact_considered;
    unsigned int compact_defer_shift;
    int compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* Set to true when the PG_migrate_skip bits should be cleared */
    bool compact_blockskip_flush;
#endif

    ZONE_PADDING(_pad3_)

    /* Zone statistics */
    atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
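watermark[] and lowmem_reserve[] meet in the allocator's zone_watermark_ok() test; the following is a deliberately simplified, hedged restatement of that logic (it ignores the per-order free-list checks), with the helper name zone_has_room invented:

#include <linux/mmzone.h>
#include <linux/vmstat.h>

/* Simplified sketch of the watermark test: a zone may serve an allocation for
 * class zone 'classzone_idx' only if its free pages stay above the chosen
 * watermark plus the reserve it keeps back from higher zone types. */
static bool zone_has_room(struct zone *z, int classzone_idx, unsigned long mark)
{
    long free = zone_page_state(z, NR_FREE_PAGES);

    return free > (long)(mark + z->lowmem_reserve[classzone_idx]);
}

It would typically be called with one of the watermark macros, for example zone_has_room(z, 0, low_wmark_pages(z)).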
struct page {
    /* First double word block */
    unsigned long flags;                /* Atomic flags, some possibly
                                         * updated asynchronously */
    union {
        struct address_space *mapping;  /* If low bit clear, points to
                                         * inode address_space, or NULL.
                                         * If page mapped as anonymous
                                         * memory, low bit is set, and
                                         * it points to anon_vma object:
                                         * see PAGE_MAPPING_ANON below.
                                         */
        void *s_mem;                    /* slab first object */
    };

    /* Second double word */
    struct {
        union {
            pgoff_t index;              /* Our offset within mapping. */
            void *freelist;             /* sl[aou]b first free object */
            bool pfmemalloc;            /* If set by the page allocator,
                                         * ALLOC_NO_WATERMARKS was set
                                         * and the low watermark was not
                                         * met implying that the system
                                         * is under some pressure. The
                                         * caller should try ensure
                                         * this page is only used to
                                         * free other pages.
                                         */
        };

        union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
            /* Used for cmpxchg_double in slub */
            unsigned long counters;
#else
            /*
             * Keep _count separate from slub cmpxchg_double data.
             * As the rest of the double word is protected by
             * slab_lock but _count is not.
             */
            unsigned counters;
#endif

            struct {
                union {
                    /*
                     * Count of ptes mapped in
                     * mms, to show when page is
                     * mapped & limit reverse map
                     * searches.
                     *
                     * Used also for tail pages
                     * refcounting instead of
                     * _count. Tail pages cannot
                     * be mapped and keeping the
                     * tail page _count zero at
                     * all times guarantees
                     * get_page_unless_zero() will
                     * never succeed on tail
                     * pages.
                     */
                    atomic_t _mapcount;
                };
            };
        };
    };

    /* Third double word block */
    union {
        struct list_head lru;           /* Pageout list, eg. active_list
                                         * protected by zone->lru_lock !
                                         * Can be used as a generic list
                                         * by the page owner.
                                         */
        struct {                        /* slub per cpu partial pages */
            struct page *next;          /* Next partial slab */
#ifdef CONFIG_64BIT
            int pages;                  /* Nr of partial slabs left */
            int pobjects;               /* Approximate # of objects */
#else
            short int pages;
            short int pobjects;
#endif
        };

        struct slab *slab_page;         /* slab fields */
        struct rcu_head rcu_head;       /* Used by SLAB
                                         * when destroying via RCU */
        /* First tail page of compound page */
        struct {
            compound_page_dtor *compound_dtor;
            unsigned long compound_order;
        };
    };

    /* Remainder is not double word aligned */
    union {
        unsigned long private;          /* Mapping-private opaque data:
                                         * usually used for buffer_heads
                                         * if PagePrivate set; used for
                                         * swp_entry_t if PageSwapCache;
                                         * indicates order in the buddy
                                         * system if PG_buddy is set.
                                         */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
        spinlock_t *ptl;
#else
        spinlock_t ptl;
#endif
#endif
        struct kmem_cache *slab_cache;  /* SL[AU]B: Pointer to slab */
        struct page *first_page;        /* Compound tail pages */
    };

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;                      /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMEMCHECK
    /*
     * kmemcheck wants to track the status of each byte in a page; this
     * is a pointer to such a status block. NULL if not tracked.
     */
    void *shadow;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
    int _last_cpupid;
#endif
};
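The mapping comment at the top of the struct is exactly what PageAnon() tests: the low bit of page->mapping marks an anon_vma pointer rather than an address_space. A small hedged sketch (classify_page is an invented helper):

#include <linux/kernel.h>
#include <linux/mm.h>

/* Hypothetical helper: classify a page the way the mapping comment describes. */
static void classify_page(struct page *page)
{
    if (PageAnon(page))         /* low bit of page->mapping set: anon_vma */
        pr_info("pfn %lu: anonymous, mapcount=%d\n",
                page_to_pfn(page), page_mapcount(page));
    else if (page->mapping)     /* low bit clear: inode address_space */
        pr_info("pfn %lu: file-backed\n", page_to_pfn(page));
    else
        pr_info("pfn %lu: not mapped to user space\n", page_to_pfn(page));
}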
Big Kernel Lock (BKL)
The Big Kernel Lock (BKL) is a lock in the Linux kernel whose basic principle matches that of an ordinary lock. Once a process acquires the BKL it enters the critical section the lock protects, but not only is that critical section locked: every critical section protected by the BKL is locked at the same time, because there is only this one kernel-wide lock.
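Historically the BKL was taken and released with lock_kernel()/unlock_kernel() from <linux/smp_lock.h>; both the API and the lock itself were removed around the 2.6.39 time frame. A hedged sketch of the old pattern (the function name is invented):

#include <linux/smp_lock.h>     /* historical header; gone in modern kernels */

static void legacy_ioctl_body(void)
{
    lock_kernel();      /* acquire the single, kernel-wide BKL */
    /* ... touch state that was historically guarded only by the BKL ... */
    unlock_kernel();    /* release it; the BKL was also dropped implicitly when the holder slept */
}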
//include/linux/percpu_counter.h
struct percpu_counter {
    raw_spinlock_t lock;        /* spinlock, used when an accurate value is needed */
    s64 count;                  /* the accurate value of the counter */
#ifdef CONFIG_HOTPLUG_CPU
    struct list_head list;      /* All percpu_counters are on a list */
#endif
    s32 __percpu *counters;     /* per-CPU array caching local updates to the counter */
};
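A minimal hedged usage sketch of the percpu_counter API that goes with this structure (the counter name nr_widgets and the functions around it are invented; note that older kernels' percpu_counter_init() takes no GFP argument):

#include <linux/kernel.h>
#include <linux/percpu_counter.h>

static struct percpu_counter nr_widgets;

static int widgets_init(void)
{
    return percpu_counter_init(&nr_widgets, 0, GFP_KERNEL);
}

static void widgets_track(void)
{
    percpu_counter_inc(&nr_widgets);    /* cheap: usually touches only the local CPU slot */
    pr_info("approx=%lld exact=%lld\n",
            (long long)percpu_counter_read(&nr_widgets),    /* may lag by the per-CPU deltas */
            (long long)percpu_counter_sum(&nr_widgets));    /* folds in every CPU under the lock */
}

static void widgets_exit(void)
{
    percpu_counter_destroy(&nr_widgets);
}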