| author    | Mike Pagano <mpagano@gentoo.org> | 2014-10-10 15:56:35 -0400 |
|-----------|----------------------------------|---------------------------|
| committer | Mike Pagano <mpagano@gentoo.org> | 2014-10-10 15:56:35 -0400 |
| commit    | 45ca8c94954b7b8d9658410f759a5258d7cdca9a (patch) | |
| tree      | 4773f3981492d82912a85d050a20c121d7e7298a | |
| parent    | Linux patch 3.12.29 (diff) | |
| download  | linux-patches-45ca8c94954b7b8d9658410f759a5258d7cdca9a.tar.gz linux-patches-45ca8c94954b7b8d9658410f759a5258d7cdca9a.tar.bz2 linux-patches-45ca8c94954b7b8d9658410f759a5258d7cdca9a.zip | |
Linux patch 3.12.30 (3.12-32)
-rw-r--r-- | 0000_README              |    4 |
-rw-r--r-- | 1029_linux-3.12.30.patch | 7724 |
2 files changed, 7728 insertions, 0 deletions
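One of the changes carried in 1029_linux-3.12.30.patch is the rework of discard_buffer() in fs/buffer.c (see the hunk below), which replaces a series of individual clear_buffer_*() calls with a single cmpxchg() retry loop that clears all of BUFFER_FLAGS_DISCARD from b_state in one atomic step. A minimal user-space sketch of that retry idiom follows; it is illustrative only — the flag names, the C11 atomics, and the clear_discard_flags() helper are stand-ins, not the kernel code.

```c
#include <stdatomic.h>
#include <stdio.h>

/* Flag bits, standing in for the BH_* bits that the patch groups
 * into BUFFER_FLAGS_DISCARD. */
#define FLAG_MAPPED (1UL << 0)
#define FLAG_NEW    (1UL << 1)
#define FLAG_REQ    (1UL << 2)
#define FLAGS_DISCARD (FLAG_MAPPED | FLAG_NEW | FLAG_REQ)

/*
 * Clear several flag bits in one atomic step using a compare-exchange
 * retry loop -- the same shape as the reworked discard_buffer().
 */
static void clear_discard_flags(_Atomic unsigned long *state)
{
    unsigned long old = atomic_load(state);

    /* On failure the CAS reloads 'old' with the current value; retry. */
    while (!atomic_compare_exchange_weak(state, &old, old & ~FLAGS_DISCARD))
        ;
}

int main(void)
{
    _Atomic unsigned long b_state = FLAG_MAPPED | FLAG_REQ | (1UL << 5);

    clear_discard_flags(&b_state);
    /* Only the unrelated bit 5 should remain set. */
    printf("0x%lx\n", atomic_load(&b_state));
    return 0;
}
```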
diff --git a/0000_README b/0000_README index ae0f6aa7..d8b89ecb 100644 --- a/0000_README +++ b/0000_README @@ -158,6 +158,10 @@ Patch: 1028_linux-3.12.29.patch From: http://www.kernel.org Desc: Linux 3.12.29 +Patch: 1029_linux-3.12.30.patch +From: http://www.kernel.org +Desc: Linux 3.12.30 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1029_linux-3.12.30.patch b/1029_linux-3.12.30.patch new file mode 100644 index 00000000..90682678 --- /dev/null +++ b/1029_linux-3.12.30.patch @@ -0,0 +1,7724 @@ +diff --git a/Makefile b/Makefile +index 67cec33d00c7..1ad1566225ca 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 12 +-SUBLEVEL = 29 ++SUBLEVEL = 30 + EXTRAVERSION = + NAME = One Giant Leap for Frogkind + +diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c +index 004ba568d93f..33294fdc402e 100644 +--- a/arch/tile/mm/homecache.c ++++ b/arch/tile/mm/homecache.c +@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) + if (put_page_testzero(page)) { + homecache_change_page_home(page, order, PAGE_HOME_HASH); + if (order == 0) { +- free_hot_cold_page(page, 0); ++ free_hot_cold_page(page, false); + } else { + init_page_count(page); + __free_pages(page, order); +diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h +index fb5e4c658f7a..ef470a7a3d0f 100644 +--- a/arch/unicore32/include/asm/mmu_context.h ++++ b/arch/unicore32/include/asm/mmu_context.h +@@ -14,6 +14,8 @@ + + #include <linux/compiler.h> + #include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/io.h> + + #include <asm/cacheflush.h> +@@ -73,7 +75,7 @@ do { \ + else \ + mm->mmap = NULL; \ + rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ +- mm->mmap_cache = NULL; \ ++ vmacache_invalidate(mm); \ + mm->map_count--; \ + remove_vma(high_vma); \ + } \ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index e6d90babc245..04905bfc508b 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void) + + static inline void __flush_tlb_one(unsigned long addr) + { +- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); + } + +@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr) + */ + static inline void __flush_tlb_up(void) + { +- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); + } + + static inline void flush_tlb_all(void) + { +- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); + } + +diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c +index ce2d0a2c3e4f..0e25a1bc5ab5 100644 +--- a/arch/x86/kernel/cpu/mtrr/generic.c ++++ b/arch/x86/kernel/cpu/mtrr/generic.c +@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ +- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); + + /* Save MTRR state */ +@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) + static void post_set(void) __releases(set_atomicity_lock) + { + /* Flush TLBs (no need to flush caches - they are disabled) */ +- 
count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); + + /* Intel (P6) standard MTRRs */ +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index dfa537a03be1..5da29d04de2f 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, + int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) + { +- int young; +- +- young = ptep_test_and_clear_young(vma, address, ptep); +- if (young) +- flush_tlb_page(vma, address); +- +- return young; ++ /* ++ * On x86 CPUs, clearing the accessed bit without a TLB flush ++ * doesn't cause data corruption. [ It could cause incorrect ++ * page aging and the (mistaken) reclaim of hot pages, but the ++ * chance of that should be relatively low. ] ++ * ++ * So as a performance optimization don't flush the TLB when ++ * clearing the accessed bit, it will eventually be flushed by ++ * a context switch or a VM operation anyway. [ In the rare ++ * event of it not getting flushed for a long time the delay ++ * shouldn't really matter because there's no real memory ++ * pressure for swapout to react to. ] ++ */ ++ return ptep_test_and_clear_young(vma, address, ptep); + } + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index ae699b3bbac8..dd8dda167a24 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + return; + +- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); ++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_end == TLB_FLUSH_ALL) + local_flush_tlb(); +@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, + info.flush_start = start; + info.flush_end = end; + +- count_vm_event(NR_TLB_REMOTE_FLUSH); ++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + if (is_uv_system()) { + unsigned int cpu; + +@@ -151,44 +151,19 @@ void flush_tlb_current_task(void) + + preempt_disable(); + +- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); + preempt_enable(); + } + +-/* +- * It can find out the THP large page, or +- * HUGETLB page in tlb_flush when THP disabled +- */ +-static inline unsigned long has_large_page(struct mm_struct *mm, +- unsigned long start, unsigned long end) +-{ +- pgd_t *pgd; +- pud_t *pud; +- pmd_t *pmd; +- unsigned long addr = ALIGN(start, HPAGE_SIZE); +- for (; addr < end; addr += HPAGE_SIZE) { +- pgd = pgd_offset(mm, addr); +- if (likely(!pgd_none(*pgd))) { +- pud = pud_offset(pgd, addr); +- if (likely(!pud_none(*pud))) { +- pmd = pmd_offset(pud, addr); +- if (likely(!pmd_none(*pmd))) +- if (pmd_large(*pmd)) +- return addr; +- } +- } +- } +- return 0; +-} +- + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag) + { + unsigned long addr; + unsigned act_entries, tlb_entries = 0; ++ unsigned long nr_base_pages; + + preempt_disable(); + if (current->active_mm != mm) +@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + tlb_entries = tlb_lli_4k[ENTRIES]; + else + tlb_entries = tlb_lld_4k[ENTRIES]; ++ + /* Assume all of TLB entries was occupied by 
this task */ +- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; ++ act_entries = tlb_entries >> tlb_flushall_shift; ++ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; ++ nr_base_pages = (end - start) >> PAGE_SHIFT; + + /* tlb_flushall_shift is on balance point, details in commit log */ +- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { +- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); ++ if (nr_base_pages > act_entries) { ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + } else { +- if (has_large_page(mm, start, end)) { +- local_flush_tlb(); +- goto flush_all; +- } + /* flush range by one by one 'invlpg' */ + for (addr = start; addr < end; addr += PAGE_SIZE) { +- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); + } + +@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) + + static void do_flush_tlb_all(void *info) + { +- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); ++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + __flush_tlb_all(); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(smp_processor_id()); +@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info) + + void flush_tlb_all(void) + { +- count_vm_event(NR_TLB_REMOTE_FLUSH); ++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 6e9ff8fac75a..6357298932bf 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, pg_index); + rcu_read_unlock(); +- if (page) { ++ if (page && !radix_tree_exceptional_entry(page)) { + misses++; + if (misses > 4) + break; +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 594bbfd4996e..7015d9079bd1 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) + spin_unlock(&eb->refs_lock); + } + +-static void mark_extent_buffer_accessed(struct extent_buffer *eb) ++static void mark_extent_buffer_accessed(struct extent_buffer *eb, ++ struct page *accessed) + { + unsigned long num_pages, i; + +@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); +- mark_page_accessed(p); ++ if (p != accessed) ++ mark_page_accessed(p); + } + } + +@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); +- mark_extent_buffer_accessed(eb); ++ mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); +@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + spin_unlock(&mapping->private_lock); + unlock_page(p); + page_cache_release(p); +- mark_extent_buffer_accessed(exists); ++ mark_extent_buffer_accessed(exists, p); + goto free_eb; + } + +@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); +- mark_page_accessed(p); + eb->pages[i] = p; + if 
(!PageUptodate(p)) + uptodate = 0; +@@ -4549,7 +4550,7 @@ again: + } + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); +- mark_extent_buffer_accessed(exists); ++ mark_extent_buffer_accessed(exists, NULL); + goto free_eb; + } + /* add one reference for the tree */ +@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); +- mark_extent_buffer_accessed(eb); ++ mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 72da4df53c9a..ad80dfa6cf91 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page +- * +- * Disable pagefault to avoid recursive lock since +- * the pages are already locked + */ +- pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); +- pagefault_enable(); + + /* Flush processor's dcache for this page */ + flush_dcache_page(page); +@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) + for (i = 0; i < num_pages; i++) { + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty +- * clear it here ++ * clear it here. There should be no need to mark the pages ++ * accessed as prepare_pages should have marked them accessed ++ * in prepare_pages via find_or_create_page() + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); +- mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } + } +diff --git a/fs/buffer.c b/fs/buffer.c +index aeeea6529bcd..b7888527f7c3 100644 +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) + int all_mapped = 1; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); +- page = find_get_page(bd_mapping, index); ++ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); + if (!page) + goto out; + +@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { ++ /* __find_get_block_slow will mark the page accessed */ + bh = __find_get_block_slow(bdev, block); + if (bh) + bh_lru_install(bh); +- } +- if (bh) ++ } else + touch_buffer(bh); ++ + return bh; + } + EXPORT_SYMBOL(__find_get_block); +@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page); + /* + * Called when truncating a buffer on a page completely. 
+ */ ++ ++/* Bits that are cleared during an invalidate */ ++#define BUFFER_FLAGS_DISCARD \ ++ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ ++ 1 << BH_Delay | 1 << BH_Unwritten) ++ + static void discard_buffer(struct buffer_head * bh) + { ++ unsigned long b_state, b_state_old; ++ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; +- clear_buffer_mapped(bh); +- clear_buffer_req(bh); +- clear_buffer_new(bh); +- clear_buffer_delay(bh); +- clear_buffer_unwritten(bh); ++ b_state = bh->b_state; ++ for (;;) { ++ b_state_old = cmpxchg(&bh->b_state, b_state, ++ (b_state & ~BUFFER_FLAGS_DISCARD)); ++ if (b_state_old == b_state) ++ break; ++ b_state = b_state_old; ++ } + unlock_buffer(bh); + } + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index e501ac3a49ff..2f6cfcaa55fd 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i + struct page *page = NULL; + + if (blocknr + i < devsize) { +- page = read_mapping_page_async(mapping, blocknr + i, +- NULL); ++ page = read_mapping_page(mapping, blocknr + i, NULL); + /* synchronous error? */ + if (IS_ERR(page)) + page = NULL; +diff --git a/fs/exec.c b/fs/exec.c +index 95eef54de2b6..26bb91bf203b 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -26,6 +26,7 @@ + #include <linux/file.h> + #include <linux/fdtable.h> + #include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/stat.h> + #include <linux/fcntl.h> + #include <linux/swap.h> +@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code); + static int exec_mmap(struct mm_struct *mm) + { + struct task_struct *tsk; +- struct mm_struct * old_mm, *active_mm; ++ struct mm_struct *old_mm, *active_mm; + + /* Notify parent that we're no longer interested in the old VM */ + tsk = current; +@@ -844,6 +845,8 @@ static int exec_mmap(struct mm_struct *mm) + tsk->mm = mm; + tsk->active_mm = mm; + activate_mm(active_mm, mm); ++ tsk->mm->vmacache_seqnum = 0; ++ vmacache_flush(tsk); + task_unlock(tsk); + arch_pick_mmap_layout(mm); + if (old_mm) { +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 242226a87be7..7620133f78bf 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. ++ * The call to ext4_mb_get_buddy_page_lock will mark the ++ * page accessed. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { +@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) + ret = -EIO; + goto err; + } +- mark_page_accessed(page); + + if (e4b.bd_buddy_page == NULL) { + /* +@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) + ret = -EIO; + goto err; + } +- mark_page_accessed(page); + err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... 
*/ +- page = find_get_page(inode->i_mapping, pnum); ++ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* +@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + ret = -EIO; + goto err; + } ++ ++ /* Pages marked accessed already */ + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); +- mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + +- page = find_get_page(inode->i_mapping, pnum); ++ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); +@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + ret = -EIO; + goto err; + } ++ ++ /* Pages marked accessed already */ + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); +- mark_page_accessed(page); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); +diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c +index bb312201ca95..15a29af63e20 100644 +--- a/fs/f2fs/checkpoint.c ++++ b/fs/f2fs/checkpoint.c +@@ -70,7 +70,6 @@ repeat: + goto repeat; + } + out: +- mark_page_accessed(page); + return page; + } + +diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c +index 51ef27894433..d0335bdb65b4 100644 +--- a/fs/f2fs/node.c ++++ b/fs/f2fs/node.c +@@ -970,7 +970,6 @@ repeat: + } + got_it: + BUG_ON(nid != nid_of_node(page)); +- mark_page_accessed(page); + return page; + } + +@@ -1026,7 +1025,6 @@ page_hit: + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } +- mark_page_accessed(page); + return page; + } + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index fa8cb4b7b8fe..fc8e4991736a 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -1613,7 +1613,7 @@ out_finish: + + static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) + { +- release_pages(req->pages, req->num_pages, 0); ++ release_pages(req->pages, req->num_pages, false); + } + + static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index 4598345ab87d..d08c108065e1 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + +- pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); +- pagefault_enable(); + flush_dcache_page(page); + +- mark_page_accessed(page); +- + if (!tmp) { + unlock_page(page); + page_cache_release(page); +diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c +index 1253c2006029..f3aee0bbe886 100644 +--- a/fs/gfs2/aops.c ++++ b/fs/gfs2/aops.c +@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + p = kmap_atomic(page); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p); +- mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; +diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c +index 52f177be3bf8..89afe3a8f626 100644 +--- a/fs/gfs2/meta_io.c ++++ b/fs/gfs2/meta_io.c +@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) + yield(); + } + } else { +- page = find_lock_page(mapping, index); ++ page = find_get_page_flags(mapping, index, ++ FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + } +@@ -145,7 +146,6 @@ struct 
buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); +- mark_page_accessed(page); + page_cache_release(page); + + return bh; +diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c +index d19b30ababf1..a4a8ed56e438 100644 +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void) + int error; + int i; + ++ if (!hugepages_supported()) { ++ pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); ++ return -ENOTSUPP; ++ } ++ + error = bdi_init(&hugetlbfs_backing_dev_info); + if (error) + return error; +diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c +index fe3c0527545f..91bf52d1a88c 100644 +--- a/fs/jffs2/fs.c ++++ b/fs/jffs2/fs.c +@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct inode *inode = OFNI_EDONI_2SFFJ(f); + struct page *pg; + +- pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, ++ pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + (void *)jffs2_do_readpage_unlock, inode); + if (IS_ERR(pg)) + return (void *)pg; +diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c +index e242bbf72972..fdb74cbb9e0c 100644 +--- a/fs/nfs/blocklayout/blocklayout.c ++++ b/fs/nfs/blocklayout/blocklayout.c +@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) + end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + if (end != NFS_I(inode)->npages) { + rcu_read_lock(); +- end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); ++ end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + rcu_read_unlock(); + } + +diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c +index a27e3fecefaf..250ed5b20c8f 100644 +--- a/fs/ntfs/attrib.c ++++ b/fs/ntfs/attrib.c +@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) + if (page) { + set_page_dirty(page); + unlock_page(page); +- mark_page_accessed(page); + page_cache_release(page); + } + ntfs_debug("Done."); +diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c +index ea4ba9daeb47..a0b2f345da2b 100644 +--- a/fs/ntfs/file.c ++++ b/fs/ntfs/file.c +@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, + } + do { + unlock_page(pages[--do_pages]); +- mark_page_accessed(pages[do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + if (unlikely(status)) +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index ad4df869c907..7724fbdf443f 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1,4 +1,5 @@ + #include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/hugetlb.h> + #include <linux/huge_mm.h> + #include <linux/mount.h> +@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) + + /* + * We remember last_addr rather than next_addr to hit with +- * mmap_cache most of the time. We have zero last_addr at ++ * vmacache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. 
+ */ +diff --git a/fs/super.c b/fs/super.c +index d127de207376..fb68a4c90c98 100644 +--- a/fs/super.c ++++ b/fs/super.c +@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, + + sb = container_of(shrink, struct super_block, s_shrink); + +- if (!grab_super_passive(sb)) +- return 0; +- ++ /* ++ * Don't call grab_super_passive as it is a potential ++ * scalability bottleneck. The counts could get updated ++ * between super_cache_count and super_cache_scan anyway. ++ * Call to super_cache_count with shrinker_rwsem held ++ * ensures the safety of call to list_lru_count_node() and ++ * s_op->nr_cached_objects(). ++ */ + if (sb->s_op && sb->s_op->nr_cached_objects) + total_objects = sb->s_op->nr_cached_objects(sb, + sc->nid); +@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink, + sc->nid); + + total_objects = vfs_pressure_ratio(total_objects); +- drop_super(sb); + return total_objects; + } + +@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s) + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_test(&s->s_active)) { + cleancache_invalidate_fs(s); +- fs->kill_sb(s); +- +- /* caches are now gone, we can safely kill the shrinker now */ + unregister_shrinker(&s->s_shrink); ++ fs->kill_sb(s); + + put_filesystem(fs); + put_super(s); +diff --git a/include/linux/compaction.h b/include/linux/compaction.h +index 091d72e70d8a..01e3132820da 100644 +--- a/include/linux/compaction.h ++++ b/include/linux/compaction.h +@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, + extern int fragmentation_index(struct zone *zone, unsigned int order); + extern unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *mask, +- bool sync, bool *contended); ++ enum migrate_mode mode, bool *contended); + extern void compact_pgdat(pg_data_t *pgdat, int order); + extern void reset_isolation_suitable(pg_data_t *pgdat); + extern unsigned long compaction_suitable(struct zone *zone, int order); +@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order) + return zone->compact_considered < defer_limit; + } + ++/* ++ * Update defer tracking counters after successful compaction of given order, ++ * which means an allocation either succeeded (alloc_success == true) or is ++ * expected to succeed. 
++ */ ++static inline void compaction_defer_reset(struct zone *zone, int order, ++ bool alloc_success) ++{ ++ if (alloc_success) { ++ zone->compact_considered = 0; ++ zone->compact_defer_shift = 0; ++ } ++ if (order >= zone->compact_order_failed) ++ zone->compact_order_failed = order + 1; ++} ++ + /* Returns true if restarting compaction after many failures */ + static inline bool compaction_restarting(struct zone *zone, int order) + { +@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) + #else + static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask, +- bool sync, bool *contended) ++ enum migrate_mode mode, bool *contended) + { + return COMPACT_CONTINUE; + } +diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h +index cc1b01cf2035..a7ebb89ae9fb 100644 +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -12,10 +12,31 @@ + #include <linux/cpumask.h> + #include <linux/nodemask.h> + #include <linux/mm.h> ++#include <linux/jump_label.h> + + #ifdef CONFIG_CPUSETS + +-extern int number_of_cpusets; /* How many cpusets are defined in system? */ ++extern struct static_key cpusets_enabled_key; ++static inline bool cpusets_enabled(void) ++{ ++ return static_key_false(&cpusets_enabled_key); ++} ++ ++static inline int nr_cpusets(void) ++{ ++ /* jump label reference count + the top-level cpuset */ ++ return static_key_count(&cpusets_enabled_key) + 1; ++} ++ ++static inline void cpuset_inc(void) ++{ ++ static_key_slow_inc(&cpusets_enabled_key); ++} ++ ++static inline void cpuset_dec(void) ++{ ++ static_key_slow_dec(&cpusets_enabled_key); ++} + + extern int cpuset_init(void); + extern void cpuset_init_smp(void); +@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); + + static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) + { +- return number_of_cpusets <= 1 || ++ return nr_cpusets() <= 1 || + __cpuset_node_allowed_softwall(node, gfp_mask); + } + + static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) + { +- return number_of_cpusets <= 1 || ++ return nr_cpusets() <= 1 || + __cpuset_node_allowed_hardwall(node, gfp_mask); + } + +@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void); + extern void cpuset_print_task_mems_allowed(struct task_struct *p); + + /* +- * get_mems_allowed is required when making decisions involving mems_allowed +- * such as during page allocation. mems_allowed can be updated in parallel +- * and depending on the new value an operation can fail potentially causing +- * process failure. A retry loop with get_mems_allowed and put_mems_allowed +- * prevents these artificial failures. ++ * read_mems_allowed_begin is required when making decisions involving ++ * mems_allowed such as during page allocation. mems_allowed can be updated in ++ * parallel and depending on the new value an operation can fail potentially ++ * causing process failure. A retry loop with read_mems_allowed_begin and ++ * read_mems_allowed_retry prevents these artificial failures. + */ +-static inline unsigned int get_mems_allowed(void) ++static inline unsigned int read_mems_allowed_begin(void) + { + return read_seqcount_begin(¤t->mems_allowed_seq); + } + + /* +- * If this returns false, the operation that took place after get_mems_allowed +- * may have failed. 
It is up to the caller to retry the operation if ++ * If this returns true, the operation that took place after ++ * read_mems_allowed_begin may have failed artificially due to a concurrent ++ * update of mems_allowed. It is up to the caller to retry the operation if + * appropriate. + */ +-static inline bool put_mems_allowed(unsigned int seq) ++static inline bool read_mems_allowed_retry(unsigned int seq) + { +- return !read_seqcount_retry(¤t->mems_allowed_seq, seq); ++ return read_seqcount_retry(¤t->mems_allowed_seq, seq); + } + + static inline void set_mems_allowed(nodemask_t nodemask) +@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) + + #else /* !CONFIG_CPUSETS */ + ++static inline bool cpusets_enabled(void) { return false; } ++ + static inline int cpuset_init(void) { return 0; } + static inline void cpuset_init_smp(void) {} + +@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) + { + } + +-static inline unsigned int get_mems_allowed(void) ++static inline unsigned int read_mems_allowed_begin(void) + { + return 0; + } + +-static inline bool put_mems_allowed(unsigned int seq) ++static inline bool read_mems_allowed_retry(unsigned int seq) + { +- return true; ++ return false; + } + + #endif /* !CONFIG_CPUSETS */ +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 9b4dd491f7e8..fa7ac989ff56 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); + + extern void __free_pages(struct page *page, unsigned int order); + extern void free_pages(unsigned long addr, unsigned int order); +-extern void free_hot_cold_page(struct page *page, int cold); +-extern void free_hot_cold_page_list(struct list_head *list, int cold); ++extern void free_hot_cold_page(struct page *page, bool cold); ++extern void free_hot_cold_page_list(struct list_head *list, bool cold); + + extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); + extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h +index a291552ab767..aac671be9581 100644 +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); + #endif /* CONFIG_DEBUG_VM */ + + extern unsigned long transparent_hugepage_flags; +-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pmd_t *dst_pmd, pmd_t *src_pmd, +- struct vm_area_struct *vma, +- unsigned long addr, unsigned long end); + extern int split_huge_page_to_list(struct page *page, struct list_head *list); + static inline int split_huge_page(struct page *page) + { +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index 5214ff63c351..511b1a0d6cc2 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h) + #endif + } + ++static inline bool hugepages_supported(void) ++{ ++ /* ++ * Some platform decide whether they support huge pages at boot ++ * time. 
On these, such as powerpc, HPAGE_SHIFT is set to 0 when ++ * there is no such support ++ */ ++ return HPAGE_SHIFT != 0; ++} ++ + #else /* CONFIG_HUGETLB_PAGE */ + struct hstate {}; + #define alloc_huge_page_node(h, nid) NULL +diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h +index a5079072da66..9216e465289a 100644 +--- a/include/linux/jump_label.h ++++ b/include/linux/jump_label.h +@@ -62,6 +62,10 @@ struct static_key { + + # include <asm/jump_label.h> + # define HAVE_JUMP_LABEL ++#else ++struct static_key { ++ atomic_t enabled; ++}; + #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ + + enum jump_label_type { +@@ -72,6 +76,12 @@ enum jump_label_type { + struct module; + + #include <linux/atomic.h> ++ ++static inline int static_key_count(struct static_key *key) ++{ ++ return atomic_read(&key->enabled); ++} ++ + #ifdef HAVE_JUMP_LABEL + + #define JUMP_LABEL_TRUE_BRANCH 1UL +@@ -122,24 +132,20 @@ extern void jump_label_apply_nops(struct module *mod); + + #else /* !HAVE_JUMP_LABEL */ + +-struct static_key { +- atomic_t enabled; +-}; +- + static __always_inline void jump_label_init(void) + { + } + + static __always_inline bool static_key_false(struct static_key *key) + { +- if (unlikely(atomic_read(&key->enabled)) > 0) ++ if (unlikely(static_key_count(key) > 0)) + return true; + return false; + } + + static __always_inline bool static_key_true(struct static_key *key) + { +- if (likely(atomic_read(&key->enabled)) > 0) ++ if (likely(static_key_count(key) > 0)) + return true; + return false; + } +@@ -179,7 +185,7 @@ static inline int jump_label_apply_nops(struct module *mod) + + static inline bool static_key_enabled(struct static_key *key) + { +- return (atomic_read(&key->enabled) > 0); ++ return static_key_count(key) > 0; + } + + #endif /* _LINUX_JUMP_LABEL_H */ +diff --git a/include/linux/migrate.h b/include/linux/migrate.h +index ee8b14ae4f3f..449905ebcab3 100644 +--- a/include/linux/migrate.h ++++ b/include/linux/migrate.h +@@ -5,7 +5,9 @@ + #include <linux/mempolicy.h> + #include <linux/migrate_mode.h> + +-typedef struct page *new_page_t(struct page *, unsigned long private, int **); ++typedef struct page *new_page_t(struct page *page, unsigned long private, ++ int **reason); ++typedef void free_page_t(struct page *page, unsigned long private); + + /* + * Return values from addresss_space_operations.migratepage(): +@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l); + extern void putback_movable_pages(struct list_head *l); + extern int migrate_page(struct address_space *, + struct page *, struct page *, enum migrate_mode); +-extern int migrate_pages(struct list_head *l, new_page_t x, ++extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, + unsigned long private, enum migrate_mode mode, int reason); + + extern int fail_migrate_page(struct address_space *, +@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping, + + static inline void putback_lru_pages(struct list_head *l) {} + static inline void putback_movable_pages(struct list_head *l) {} +-static inline int migrate_pages(struct list_head *l, new_page_t x, +- unsigned long private, enum migrate_mode mode, int reason) ++static inline int migrate_pages(struct list_head *l, new_page_t new, ++ free_page_t free, unsigned long private, enum migrate_mode mode, ++ int reason) + { return -ENOSYS; } + + static inline int migrate_prep(void) { return -ENOSYS; } +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 073734339583..2b3a5330dcf2 100644 
+--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags); + extern bool skip_free_areas_node(unsigned int flags, int nid); + + int shmem_zero_setup(struct vm_area_struct *); ++#ifdef CONFIG_SHMEM ++bool shmem_mapping(struct address_space *mapping); ++#else ++static inline bool shmem_mapping(struct address_space *mapping) ++{ ++ return false; ++} ++#endif + + extern int can_do_mlock(void); + extern int user_shm_lock(size_t, struct user_struct *); +@@ -1623,9 +1631,6 @@ void page_cache_async_readahead(struct address_space *mapping, + unsigned long size); + + unsigned long max_sane_readahead(unsigned long nr); +-unsigned long ra_submit(struct file_ra_state *ra, +- struct address_space *mapping, +- struct file *filp); + + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ + extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 8e082f18fb6a..b8131e7d6eda 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -324,9 +324,9 @@ struct mm_rss_stat { + + struct kioctx_table; + struct mm_struct { +- struct vm_area_struct * mmap; /* list of VMAs */ ++ struct vm_area_struct *mmap; /* list of VMAs */ + struct rb_root mm_rb; +- struct vm_area_struct * mmap_cache; /* last find_vma result */ ++ u32 vmacache_seqnum; /* per-thread vmacache */ + #ifdef CONFIG_MMU + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 56482904a676..450f19c5c865 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled; + #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) + #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) + +-static inline int get_pageblock_migratetype(struct page *page) ++#define get_pageblock_migratetype(page) \ ++ get_pfnblock_flags_mask(page, page_to_pfn(page), \ ++ PB_migrate_end, MIGRATETYPE_MASK) ++ ++static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) + { + BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); +- return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); ++ return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, ++ MIGRATETYPE_MASK); + } + + struct free_area { +@@ -138,6 +143,7 @@ enum zone_stat_item { + NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ + NR_DIRTIED, /* page dirtyings since bootup */ + NR_WRITTEN, /* page writings since bootup */ ++ NR_PAGES_SCANNED, /* pages scanned since last reclaim */ + #ifdef CONFIG_NUMA + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ +@@ -316,19 +322,12 @@ enum zone_type { + #ifndef __GENERATING_BOUNDS_H + + struct zone { +- /* Fields commonly accessed by the page allocator */ ++ /* Read-mostly fields */ + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + + /* +- * When free pages are below this point, additional steps are taken +- * when reading the number of free pages to avoid per-cpu counter +- * drift allowing watermarks to be breached +- */ +- unsigned long percpu_drift_mark; +- +- /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several + * GB of ram we must reserve some of the lower zone 
memory (otherwise we risk +@@ -336,40 +335,26 @@ struct zone { + * on the higher zones). This array is recalculated at runtime if the + * sysctl_lowmem_reserve_ratio sysctl changes. + */ +- unsigned long lowmem_reserve[MAX_NR_ZONES]; +- +- /* +- * This is a per-zone reserve of pages that should not be +- * considered dirtyable memory. +- */ +- unsigned long dirty_balance_reserve; ++ long lowmem_reserve[MAX_NR_ZONES]; + + #ifdef CONFIG_NUMA + int node; ++#endif ++ + /* +- * zone reclaim becomes active if more unmapped pages exist. ++ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on ++ * this zone's LRU. Maintained by the pageout code. + */ +- unsigned long min_unmapped_pages; +- unsigned long min_slab_pages; +-#endif ++ unsigned int inactive_ratio; ++ ++ struct pglist_data *zone_pgdat; + struct per_cpu_pageset __percpu *pageset; ++ + /* +- * free areas of different sizes ++ * This is a per-zone reserve of pages that should not be ++ * considered dirtyable memory. + */ +- spinlock_t lock; +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA +- /* Set to true when the PG_migrate_skip bits should be cleared */ +- bool compact_blockskip_flush; +- +- /* pfns where compaction scanners should start */ +- unsigned long compact_cached_free_pfn; +- unsigned long compact_cached_migrate_pfn; +-#endif +-#ifdef CONFIG_MEMORY_HOTPLUG +- /* see spanned/present_pages for more description */ +- seqlock_t span_seqlock; +-#endif +- struct free_area free_area[MAX_ORDER]; ++ unsigned long dirty_balance_reserve; + + #ifndef CONFIG_SPARSEMEM + /* +@@ -379,71 +364,14 @@ struct zone { + unsigned long *pageblock_flags; + #endif /* CONFIG_SPARSEMEM */ + +-#ifdef CONFIG_COMPACTION +- /* +- * On compaction failure, 1<<compact_defer_shift compactions +- * are skipped before trying again. The number attempted since +- * last failure is tracked with compact_considered. +- */ +- unsigned int compact_considered; +- unsigned int compact_defer_shift; +- int compact_order_failed; +-#endif +- +- ZONE_PADDING(_pad1_) +- +- /* Fields commonly accessed by the page reclaim scanner */ +- spinlock_t lru_lock; +- struct lruvec lruvec; +- +- unsigned long pages_scanned; /* since last reclaim */ +- unsigned long flags; /* zone flags, see below */ +- +- /* Zone statistics */ +- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +- +- /* +- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on +- * this zone's LRU. Maintained by the pageout code. +- */ +- unsigned int inactive_ratio; +- +- +- ZONE_PADDING(_pad2_) +- /* Rarely used or read-mostly fields */ +- ++#ifdef CONFIG_NUMA + /* +- * wait_table -- the array holding the hash table +- * wait_table_hash_nr_entries -- the size of the hash table array +- * wait_table_bits -- wait_table_size == (1 << wait_table_bits) +- * +- * The purpose of all these is to keep track of the people +- * waiting for a page to become available and make them +- * runnable again when possible. The trouble is that this +- * consumes a lot of space, especially when so few things +- * wait on pages at a given time. So instead of using +- * per-page waitqueues, we use a waitqueue hash table. +- * +- * The bucket discipline is to sleep on the same queue when +- * colliding and wake all in that wait queue when removing. +- * When something wakes, it must check to be sure its page is +- * truly available, a la thundering herd. The cost of a +- * collision is great, but given the expected load of the +- * table, they should be so rare as to be outweighed by the +- * benefits from the saved space. 
+- * +- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the +- * primary users of these fields, and in mm/page_alloc.c +- * free_area_init_core() performs the initialization of them. ++ * zone reclaim becomes active if more unmapped pages exist. + */ +- wait_queue_head_t * wait_table; +- unsigned long wait_table_hash_nr_entries; +- unsigned long wait_table_bits; ++ unsigned long min_unmapped_pages; ++ unsigned long min_slab_pages; ++#endif /* CONFIG_NUMA */ + +- /* +- * Discontig memory support fields. +- */ +- struct pglist_data *zone_pgdat; + /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ + unsigned long zone_start_pfn; + +@@ -489,14 +417,103 @@ struct zone { + * adjust_managed_page_count() should be used instead of directly + * touching zone->managed_pages and totalram_pages. + */ ++ unsigned long managed_pages; + unsigned long spanned_pages; + unsigned long present_pages; +- unsigned long managed_pages; ++ ++ const char *name; + + /* +- * rarely used fields: ++ * Number of MIGRATE_RESEVE page block. To maintain for just ++ * optimization. Protected by zone->lock. + */ +- const char *name; ++ int nr_migrate_reserve_block; ++ ++#ifdef CONFIG_MEMORY_HOTPLUG ++ /* see spanned/present_pages for more description */ ++ seqlock_t span_seqlock; ++#endif ++ ++ /* ++ * wait_table -- the array holding the hash table ++ * wait_table_hash_nr_entries -- the size of the hash table array ++ * wait_table_bits -- wait_table_size == (1 << wait_table_bits) ++ * ++ * The purpose of all these is to keep track of the people ++ * waiting for a page to become available and make them ++ * runnable again when possible. The trouble is that this ++ * consumes a lot of space, especially when so few things ++ * wait on pages at a given time. So instead of using ++ * per-page waitqueues, we use a waitqueue hash table. ++ * ++ * The bucket discipline is to sleep on the same queue when ++ * colliding and wake all in that wait queue when removing. ++ * When something wakes, it must check to be sure its page is ++ * truly available, a la thundering herd. The cost of a ++ * collision is great, but given the expected load of the ++ * table, they should be so rare as to be outweighed by the ++ * benefits from the saved space. ++ * ++ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the ++ * primary users of these fields, and in mm/page_alloc.c ++ * free_area_init_core() performs the initialization of them. 
++ */ ++ wait_queue_head_t *wait_table; ++ unsigned long wait_table_hash_nr_entries; ++ unsigned long wait_table_bits; ++ ++ ZONE_PADDING(_pad1_) ++ ++ /* Write-intensive fields used from the page allocator */ ++ spinlock_t lock; ++ ++ /* free areas of different sizes */ ++ struct free_area free_area[MAX_ORDER]; ++ ++ /* zone flags, see below */ ++ unsigned long flags; ++ ++ ZONE_PADDING(_pad2_) ++ ++ /* Write-intensive fields used by page reclaim */ ++ ++ /* Fields commonly accessed by the page reclaim scanner */ ++ spinlock_t lru_lock; ++ struct lruvec lruvec; ++ ++ /* ++ * When free pages are below this point, additional steps are taken ++ * when reading the number of free pages to avoid per-cpu counter ++ * drift allowing watermarks to be breached ++ */ ++ unsigned long percpu_drift_mark; ++ ++#if defined CONFIG_COMPACTION || defined CONFIG_CMA ++ /* pfn where compaction free scanner should start */ ++ unsigned long compact_cached_free_pfn; ++ /* pfn where async and sync compaction migration scanner should start */ ++ unsigned long compact_cached_migrate_pfn[2]; ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ /* ++ * On compaction failure, 1<<compact_defer_shift compactions ++ * are skipped before trying again. The number attempted since ++ * last failure is tracked with compact_considered. ++ */ ++ unsigned int compact_considered; ++ unsigned int compact_defer_shift; ++ int compact_order_failed; ++#endif ++ ++#if defined CONFIG_COMPACTION || defined CONFIG_CMA ++ /* Set to true when the PG_migrate_skip bits should be cleared */ ++ bool compact_blockskip_flush; ++#endif ++ ++ ZONE_PADDING(_pad3_) ++ /* Zone statistics */ ++ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; + } ____cacheline_internodealigned_in_smp; + + typedef enum { +@@ -512,6 +529,7 @@ typedef enum { + ZONE_WRITEBACK, /* reclaim scanning has recently found + * many pages under writeback + */ ++ ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ + } zone_flags_t; + + static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) +@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) + return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); + } + ++static inline int zone_is_fair_depleted(const struct zone *zone) ++{ ++ return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); ++} ++ + static inline int zone_is_oom_locked(const struct zone *zone) + { + return test_bit(ZONE_OOM_LOCKED, &zone->flags); +@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) + extern struct mutex zonelists_mutex; + void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); + void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +- int classzone_idx, int alloc_flags); +-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, +- int classzone_idx, int alloc_flags); ++bool zone_watermark_ok(struct zone *z, unsigned int order, ++ unsigned long mark, int classzone_idx, int alloc_flags); ++bool zone_watermark_ok_safe(struct zone *z, unsigned int order, ++ unsigned long mark, int classzone_idx, int alloc_flags); + enum memmap_context { + MEMMAP_EARLY, + MEMMAP_HOTPLUG, +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index dd7d45b5c496..2284ea62c6cc 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -198,6 +198,7 @@ struct page; /* forward declaration */ + TESTPAGEFLAG(Locked, locked) + PAGEFLAG(Error, error) TESTCLEARFLAG(Error, 
error) + PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) ++ __SETPAGEFLAG(Referenced, referenced) + PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) + PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) + PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) +@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ + PAGEFLAG(SavePinned, savepinned); /* Xen */ + PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) + PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) ++ __SETPAGEFLAG(SwapBacked, swapbacked) + + __PAGEFLAG(SlobFree, slob_free) + +@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) + TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) + PAGEFLAG(MappedToDisk, mappedtodisk) + +-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ ++/* PG_readahead is only used for reads; PG_reclaim is only for writes */ + PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) +-PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ ++PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) + + #ifdef CONFIG_HIGHMEM + /* +diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h +index c08730c10c7a..2baeee12f48e 100644 +--- a/include/linux/pageblock-flags.h ++++ b/include/linux/pageblock-flags.h +@@ -65,33 +65,26 @@ extern int pageblock_order; + /* Forward declaration */ + struct page; + +-unsigned long get_pageblock_flags_mask(struct page *page, ++unsigned long get_pfnblock_flags_mask(struct page *page, ++ unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask); +-void set_pageblock_flags_mask(struct page *page, ++ ++void set_pfnblock_flags_mask(struct page *page, + unsigned long flags, ++ unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask); + + /* Declarations for getting and setting flags. 
See mm/page_alloc.c */ +-static inline unsigned long get_pageblock_flags_group(struct page *page, +- int start_bitidx, int end_bitidx) +-{ +- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; +- unsigned long mask = (1 << nr_flag_bits) - 1; +- +- return get_pageblock_flags_mask(page, end_bitidx, mask); +-} +- +-static inline void set_pageblock_flags_group(struct page *page, +- unsigned long flags, +- int start_bitidx, int end_bitidx) +-{ +- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; +- unsigned long mask = (1 << nr_flag_bits) - 1; +- +- set_pageblock_flags_mask(page, flags, end_bitidx, mask); +-} ++#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \ ++ get_pfnblock_flags_mask(page, page_to_pfn(page), \ ++ end_bitidx, \ ++ (1 << (end_bitidx - start_bitidx + 1)) - 1) ++#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \ ++ set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \ ++ end_bitidx, \ ++ (1 << (end_bitidx - start_bitidx + 1)) - 1) + + #ifdef CONFIG_COMPACTION + #define get_pageblock_skip(page) \ +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index e3dea75a078b..d57a02a9747b 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + + #define page_cache_get(page) get_page(page) + #define page_cache_release(page) put_page(page) +-void release_pages(struct page **pages, int nr, int cold); ++void release_pages(struct page **pages, int nr, bool cold); + + /* + * speculatively take a reference to a page. +@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) + + typedef int filler_t(void *, struct page *); + +-extern struct page * find_get_page(struct address_space *mapping, +- pgoff_t index); +-extern struct page * find_lock_page(struct address_space *mapping, +- pgoff_t index); +-extern struct page * find_or_create_page(struct address_space *mapping, +- pgoff_t index, gfp_t gfp_mask); ++pgoff_t page_cache_next_hole(struct address_space *mapping, ++ pgoff_t index, unsigned long max_scan); ++pgoff_t page_cache_prev_hole(struct address_space *mapping, ++ pgoff_t index, unsigned long max_scan); ++ ++#define FGP_ACCESSED 0x00000001 ++#define FGP_LOCK 0x00000002 ++#define FGP_CREAT 0x00000004 ++#define FGP_WRITE 0x00000008 ++#define FGP_NOFS 0x00000010 ++#define FGP_NOWAIT 0x00000020 ++ ++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, ++ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask); ++ ++/** ++ * find_get_page - find and get a page reference ++ * @mapping: the address_space to search ++ * @offset: the page index ++ * ++ * Looks up the page cache slot at @mapping & @offset. If there is a ++ * page cache page, it is returned with an increased refcount. ++ * ++ * Otherwise, %NULL is returned. ++ */ ++static inline struct page *find_get_page(struct address_space *mapping, ++ pgoff_t offset) ++{ ++ return pagecache_get_page(mapping, offset, 0, 0, 0); ++} ++ ++static inline struct page *find_get_page_flags(struct address_space *mapping, ++ pgoff_t offset, int fgp_flags) ++{ ++ return pagecache_get_page(mapping, offset, fgp_flags, 0, 0); ++} ++ ++/** ++ * find_lock_page - locate, pin and lock a pagecache page ++ * pagecache_get_page - find and get a page reference ++ * @mapping: the address_space to search ++ * @offset: the page index ++ * ++ * Looks up the page cache slot at @mapping & @offset. 
If there is a ++ * page cache page, it is returned locked and with an increased ++ * refcount. ++ * ++ * Otherwise, %NULL is returned. ++ * ++ * find_lock_page() may sleep. ++ */ ++static inline struct page *find_lock_page(struct address_space *mapping, ++ pgoff_t offset) ++{ ++ return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0); ++} ++ ++/** ++ * find_or_create_page - locate or add a pagecache page ++ * @mapping: the page's address_space ++ * @index: the page's index into the mapping ++ * @gfp_mask: page allocation mode ++ * ++ * Looks up the page cache slot at @mapping & @offset. If there is a ++ * page cache page, it is returned locked and with an increased ++ * refcount. ++ * ++ * If the page is not present, a new page is allocated using @gfp_mask ++ * and added to the page cache and the VM's LRU list. The page is ++ * returned locked and with an increased refcount. ++ * ++ * On memory exhaustion, %NULL is returned. ++ * ++ * find_or_create_page() may sleep, even if @gfp_flags specifies an ++ * atomic allocation! ++ */ ++static inline struct page *find_or_create_page(struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) ++{ ++ return pagecache_get_page(mapping, offset, ++ FGP_LOCK|FGP_ACCESSED|FGP_CREAT, ++ gfp_mask, gfp_mask & GFP_RECLAIM_MASK); ++} ++ ++/** ++ * grab_cache_page_nowait - returns locked page at given index in given cache ++ * @mapping: target address_space ++ * @index: the page index ++ * ++ * Same as grab_cache_page(), but do not wait if the page is unavailable. ++ * This is intended for speculative data generators, where the data can ++ * be regenerated if the page couldn't be grabbed. This routine should ++ * be safe to call while holding the lock for another page. ++ * ++ * Clear __GFP_FS when allocating the page to avoid recursion into the fs ++ * and deadlock against the caller's locked page. 
++ */ ++static inline struct page *grab_cache_page_nowait(struct address_space *mapping, ++ pgoff_t index) ++{ ++ return pagecache_get_page(mapping, index, ++ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, ++ mapping_gfp_mask(mapping), ++ GFP_NOFS); ++} ++ ++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); ++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); ++unsigned find_get_entries(struct address_space *mapping, pgoff_t start, ++ unsigned int nr_entries, struct page **entries, ++ pgoff_t *indices); + unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages); + unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, +@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, + return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); + } + +-extern struct page * grab_cache_page_nowait(struct address_space *mapping, +- pgoff_t index); +-extern struct page * read_cache_page_async(struct address_space *mapping, +- pgoff_t index, filler_t *filler, void *data); + extern struct page * read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, void *data); + extern struct page * read_cache_page_gfp(struct address_space *mapping, +@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping, + extern int read_cache_pages(struct address_space *mapping, + struct list_head *pages, filler_t *filler, void *data); + +-static inline struct page *read_mapping_page_async( +- struct address_space *mapping, +- pgoff_t index, void *data) +-{ +- filler_t *filler = (filler_t *)mapping->a_ops->readpage; +- return read_cache_page_async(mapping, index, filler, data); +-} +- + static inline struct page *read_mapping_page(struct address_space *mapping, + pgoff_t index, void *data) + { +diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h +index e4dbfab37729..b45d391b4540 100644 +--- a/include/linux/pagevec.h ++++ b/include/linux/pagevec.h +@@ -22,6 +22,11 @@ struct pagevec { + + void __pagevec_release(struct pagevec *pvec); + void __pagevec_lru_add(struct pagevec *pvec); ++unsigned pagevec_lookup_entries(struct pagevec *pvec, ++ struct address_space *mapping, ++ pgoff_t start, unsigned nr_entries, ++ pgoff_t *indices); ++void pagevec_remove_exceptionals(struct pagevec *pvec); + unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages); + unsigned pagevec_lookup_tag(struct pagevec *pvec, +diff --git a/include/linux/plist.h b/include/linux/plist.h +index aa0fb390bd29..8b6c970cff6c 100644 +--- a/include/linux/plist.h ++++ b/include/linux/plist.h +@@ -98,6 +98,13 @@ struct plist_node { + } + + /** ++ * PLIST_HEAD - declare and init plist_head ++ * @head: name for struct plist_head variable ++ */ ++#define PLIST_HEAD(head) \ ++ struct plist_head head = PLIST_HEAD_INIT(head) ++ ++/** + * PLIST_NODE_INIT - static struct plist_node initializer + * @node: struct plist_node variable name + * @__prio: initial node priority +@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) + extern void plist_add(struct plist_node *node, struct plist_head *head); + extern void plist_del(struct plist_node *node, struct plist_head *head); + ++extern void plist_requeue(struct plist_node *node, struct plist_head *head); ++ + /** + * plist_for_each - iterate over the plist + * @pos: the type * to use as a loop counter 
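The hunks above collapse find_get_page(), find_lock_page(), find_or_create_page() and grab_cache_page_nowait() into thin inline wrappers around a single pagecache_get_page() call driven by FGP_* flags. As a rough illustration of how a caller combines those flags, here is a minimal sketch; it is not part of the patch, the helper names are invented, and it assumes a 3.12.30 tree with this series applied:

#include <linux/pagemap.h>
#include <linux/gfp.h>

/* Look up an existing page, lock it and mark it accessed; never allocate. */
static struct page *example_get_locked(struct address_space *mapping,
				       pgoff_t index)
{
	return pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_ACCESSED,
				  0, 0);
}

/*
 * Find-or-create with __GFP_FS cleared, roughly what the new
 * grab_cache_page_nowait() wrapper does minus FGP_NOWAIT.
 */
static struct page *example_find_or_create_nofs(struct address_space *mapping,
						pgoff_t index)
{
	return pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT | FGP_NOFS,
				  mapping_gfp_mask(mapping),
				  GFP_NOFS);
}

On success both helpers return the page locked with an elevated refcount, so the caller pairs them with unlock_page() and page_cache_release().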
+@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); + list_for_each_entry(pos, &(head)->node_list, node_list) + + /** ++ * plist_for_each_continue - continue iteration over the plist ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * ++ * Continue to iterate over plist, continuing after the current position. ++ */ ++#define plist_for_each_continue(pos, head) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, node_list) ++ ++/** + * plist_for_each_safe - iterate safely over a plist of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); + list_for_each_entry(pos, &(head)->node_list, mem.node_list) + + /** ++ * plist_for_each_entry_continue - continue iteration over list of given type ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * @m: the name of the list_struct within the struct ++ * ++ * Continue to iterate over list of given type, continuing after ++ * the current position. ++ */ ++#define plist_for_each_entry_continue(pos, head, m) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) ++ ++/** + * plist_for_each_entry_safe - iterate safely over list of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node) + #endif + + /** ++ * plist_next - get the next entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_next(pos) \ ++ list_next_entry(pos, node_list) ++ ++/** ++ * plist_prev - get the prev entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_prev(pos) \ ++ list_prev_entry(pos, node_list) ++ ++/** + * plist_first - return the first node (and thus, highest priority) + * @head: the &struct plist_head pointer + * +diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h +index 403940787be1..e8be53ecfc45 100644 +--- a/include/linux/radix-tree.h ++++ b/include/linux/radix-tree.h +@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) + int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + void *radix_tree_lookup(struct radix_tree_root *, unsigned long); + void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); ++void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); + void *radix_tree_delete(struct radix_tree_root *, unsigned long); + unsigned int + radix_tree_gang_lookup(struct radix_tree_root *root, void **results, +@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, + void ***results, unsigned long *indices, + unsigned long first_index, unsigned int max_items); +-unsigned long radix_tree_next_hole(struct radix_tree_root *root, +- unsigned long index, unsigned long max_scan); +-unsigned long radix_tree_prev_hole(struct radix_tree_root *root, +- unsigned long index, unsigned long max_scan); + int radix_tree_preload(gfp_t gfp_mask); + int radix_tree_maybe_preload(gfp_t gfp_mask); + void radix_tree_init(void); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 0827bec7d82f..cb67b4e2dba2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -63,6 +63,10 @@ struct fs_struct; + struct 
perf_event_context; + struct blk_plug; + ++#define VMACACHE_BITS 2 ++#define VMACACHE_SIZE (1U << VMACACHE_BITS) ++#define VMACACHE_MASK (VMACACHE_SIZE - 1) ++ + /* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. +@@ -1093,6 +1097,9 @@ struct task_struct { + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif ++ /* per-thread vma caching */ ++ u32 vmacache_seqnum; ++ struct vm_area_struct *vmacache[VMACACHE_SIZE]; + #if defined(SPLIT_RSS_COUNTING) + struct task_rss_stat rss_stat; + #endif +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index 30aa0dc60d75..deb49609cd36 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name, + loff_t size, unsigned long flags); + extern int shmem_zero_setup(struct vm_area_struct *); + extern int shmem_lock(struct file *file, int lock, struct user_struct *user); ++extern bool shmem_mapping(struct address_space *mapping); + extern void shmem_unlock_mapping(struct address_space *mapping); + extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 46ba0c6c219f..241bf0922770 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -214,8 +214,9 @@ struct percpu_cluster { + struct swap_info_struct { + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ ++ struct plist_node list; /* entry in swap_active_head */ ++ struct plist_node avail_list; /* entry in swap_avail_head */ + signed char type; /* strange name for an index */ +- signed char next; /* next type on the swap list */ + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ +@@ -255,11 +256,6 @@ struct swap_info_struct { + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ + }; + +-struct swap_list_t { +- int head; /* head of priority-ordered swapfile list */ +- int next; /* swapfile to be used next */ +-}; +- + /* linux/mm/page_alloc.c */ + extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; +@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void); + + + /* linux/mm/swap.c */ +-extern void __lru_cache_add(struct page *); + extern void lru_cache_add(struct page *); ++extern void lru_cache_add_anon(struct page *page); ++extern void lru_cache_add_file(struct page *page); + extern void lru_add_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *head); + extern void activate_page(struct page *); + extern void mark_page_accessed(struct page *); ++extern void init_page_accessed(struct page *page); + extern void lru_add_drain(void); + extern void lru_add_drain_cpu(int cpu); + extern void lru_add_drain_all(void); +@@ -287,22 +285,6 @@ extern void swap_setup(void); + + extern void add_page_to_unevictable_list(struct page *page); + +-/** +- * lru_cache_add: add a page to the page lists +- * @page: the page to add +- */ +-static inline void lru_cache_add_anon(struct page *page) +-{ +- ClearPageActive(page); +- __lru_cache_add(page); +-} +- +-static inline void lru_cache_add_file(struct page *page) +-{ +- ClearPageActive(page); +- __lru_cache_add(page); +-} +- + /* linux/mm/vmscan.c */ + extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask, nodemask_t *mask); +@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) + #define free_page_and_swap_cache(page) \ + page_cache_release(page) + #define free_pages_and_swap_cache(pages, nr) \ +- release_pages((pages), (nr), 0); ++ release_pages((pages), (nr), false); + + static inline void show_swap_cache_info(void) + { +diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h +index e282624e8c10..388293a91e8c 100644 +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -6,7 +6,7 @@ + * want to expose them to the dozens of source files that include swap.h + */ + extern spinlock_t swap_lock; +-extern struct swap_list_t swap_list; ++extern struct plist_head swap_active_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); + +diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h +index c557c6d096de..3a712e2e7d76 100644 +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + THP_ZERO_PAGE_ALLOC, + THP_ZERO_PAGE_ALLOC_FAILED, + #endif ++#ifdef CONFIG_DEBUG_TLBFLUSH + #ifdef CONFIG_SMP + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +-#endif ++#endif /* CONFIG_SMP */ + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, ++#endif /* CONFIG_DEBUG_TLBFLUSH */ + NR_VM_EVENT_ITEMS + }; + +diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h +new file mode 100644 +index 000000000000..c3fa0fd43949 +--- /dev/null ++++ b/include/linux/vmacache.h +@@ -0,0 +1,38 @@ ++#ifndef __LINUX_VMACACHE_H ++#define __LINUX_VMACACHE_H ++ ++#include <linux/sched.h> ++#include <linux/mm.h> ++ ++/* ++ * Hash based on the page number. 
Provides a good hit rate for ++ * workloads with good locality and those with random accesses as well. ++ */ ++#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) ++ ++static inline void vmacache_flush(struct task_struct *tsk) ++{ ++ memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); ++} ++ ++extern void vmacache_flush_all(struct mm_struct *mm); ++extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); ++extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, ++ unsigned long addr); ++ ++#ifndef CONFIG_MMU ++extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, ++ unsigned long start, ++ unsigned long end); ++#endif ++ ++static inline void vmacache_invalidate(struct mm_struct *mm) ++{ ++ mm->vmacache_seqnum++; ++ ++ /* deal with overflows */ ++ if (unlikely(mm->vmacache_seqnum == 0)) ++ vmacache_flush_all(mm); ++} ++ ++#endif /* __LINUX_VMACACHE_H */ +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index a67b38415768..67ce70c8279b 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu) + #define count_vm_numa_events(x, y) do { (void)(y); } while (0) + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_DEBUG_TLBFLUSH ++#define count_vm_tlb_event(x) count_vm_event(x) ++#define count_vm_tlb_events(x, y) count_vm_events(x, y) ++#else ++#define count_vm_tlb_event(x) do {} while (0) ++#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) ++#endif ++ + #define __count_zone_vm_events(item, zone, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ + zone_idx(zone), delta) +diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h +index fde1b3e94c7d..c6814b917bdf 100644 +--- a/include/trace/events/compaction.h ++++ b/include/trace/events/compaction.h +@@ -5,6 +5,7 @@ + #define _TRACE_COMPACTION_H + + #include <linux/types.h> ++#include <linux/list.h> + #include <linux/tracepoint.h> + #include <trace/events/gfpflags.h> + +@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + + TRACE_EVENT(mm_compaction_migratepages, + +- TP_PROTO(unsigned long nr_migrated, +- unsigned long nr_failed), ++ TP_PROTO(unsigned long nr_all, ++ int migrate_rc, ++ struct list_head *migratepages), + +- TP_ARGS(nr_migrated, nr_failed), ++ TP_ARGS(nr_all, migrate_rc, migratepages), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) +@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages, + ), + + TP_fast_assign( +- __entry->nr_migrated = nr_migrated; ++ unsigned long nr_failed = 0; ++ struct list_head *page_lru; ++ ++ /* ++ * migrate_pages() returns either a non-negative number ++ * with the number of pages that failed migration, or an ++ * error code, in which case we need to count the remaining ++ * pages manually ++ */ ++ if (migrate_rc >= 0) ++ nr_failed = migrate_rc; ++ else ++ list_for_each(page_lru, migratepages) ++ nr_failed++; ++ ++ __entry->nr_migrated = nr_all - nr_failed; + __entry->nr_failed = nr_failed; + ), + +@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages, + __entry->nr_failed) + ); + ++TRACE_EVENT(mm_compaction_begin, ++ TP_PROTO(unsigned long zone_start, unsigned long migrate_start, ++ unsigned long free_start, unsigned long zone_end), ++ ++ TP_ARGS(zone_start, migrate_start, free_start, zone_end), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, zone_start) ++ __field(unsigned long, migrate_start) ++ __field(unsigned long, free_start) ++ 
__field(unsigned long, zone_end) ++ ), ++ ++ TP_fast_assign( ++ __entry->zone_start = zone_start; ++ __entry->migrate_start = migrate_start; ++ __entry->free_start = free_start; ++ __entry->zone_end = zone_end; ++ ), ++ ++ TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", ++ __entry->zone_start, ++ __entry->migrate_start, ++ __entry->free_start, ++ __entry->zone_end) ++); ++ ++TRACE_EVENT(mm_compaction_end, ++ TP_PROTO(int status), ++ ++ TP_ARGS(status), ++ ++ TP_STRUCT__entry( ++ __field(int, status) ++ ), ++ ++ TP_fast_assign( ++ __entry->status = status; ++ ), ++ ++ TP_printk("status=%d", __entry->status) ++); + + #endif /* _TRACE_COMPACTION_H */ + +diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h +index d0c613476620..aece1346ceb7 100644 +--- a/include/trace/events/kmem.h ++++ b/include/trace/events/kmem.h +@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain, + TRACE_EVENT(mm_page_alloc_extfrag, + + TP_PROTO(struct page *page, +- int alloc_order, int fallback_order, +- int alloc_migratetype, int fallback_migratetype, +- int change_ownership), ++ int alloc_order, int fallback_order, ++ int alloc_migratetype, int fallback_migratetype, int new_migratetype), + + TP_ARGS(page, + alloc_order, fallback_order, +- alloc_migratetype, fallback_migratetype, +- change_ownership), ++ alloc_migratetype, fallback_migratetype, new_migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) +@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, + __entry->fallback_order = fallback_order; + __entry->alloc_migratetype = alloc_migratetype; + __entry->fallback_migratetype = fallback_migratetype; +- __entry->change_ownership = change_ownership; ++ __entry->change_ownership = (new_migratetype == alloc_migratetype); + ), + + TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", +diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h +index 1c9fabde69e4..ce0803b8d05f 100644 +--- a/include/trace/events/pagemap.h ++++ b/include/trace/events/pagemap.h +@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, + + TP_PROTO( + struct page *page, +- unsigned long pfn, +- int lru, +- unsigned long flags ++ int lru + ), + +- TP_ARGS(page, pfn, lru, flags), ++ TP_ARGS(page, lru), + + TP_STRUCT__entry( + __field(struct page *, page ) +@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, + + TP_fast_assign( + __entry->page = page; +- __entry->pfn = pfn; ++ __entry->pfn = page_to_pfn(page); + __entry->lru = lru; +- __entry->flags = flags; ++ __entry->flags = trace_pagemap_flags(page); + ), + + /* Flag format is based on page-types.c formatting for pagemap */ +@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, + + TRACE_EVENT(mm_lru_activate, + +- TP_PROTO(struct page *page, unsigned long pfn), ++ TP_PROTO(struct page *page), + +- TP_ARGS(page, pfn), ++ TP_ARGS(page), + + TP_STRUCT__entry( + __field(struct page *, page ) +@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, + + TP_fast_assign( + __entry->page = page; +- __entry->pfn = pfn; ++ __entry->pfn = page_to_pfn(page); + ), + + /* Flag format is based on page-types.c formatting for pagemap */ +diff --git a/kernel/cpuset.c b/kernel/cpuset.c +index 0b29c52479a6..c8289138cad4 100644 +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -61,12 +61,7 @@ + #include <linux/cgroup.h> + #include <linux/wait.h> + +-/* +- * Tracks how many cpusets are currently defined in system. 
+- * When there is only one cpuset (the root cpuset) we can +- * short circuit some hooks. +- */ +-int number_of_cpusets __read_mostly; ++struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; + + /* See "Frequency meter" comments, below. */ + +@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, + goto done; + } + +- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); ++ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; +@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, + task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing +- * get_mems_allowed(). If at least one node remains unchanged and ++ * read_mems_allowed_begin(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ +@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + +- number_of_cpusets++; ++ cpuset_inc(); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + goto out_unlock; +@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + +- number_of_cpusets--; ++ cpuset_dec(); + clear_bit(CS_ONLINE, &cs->flags); + + mutex_unlock(&cpuset_mutex); +@@ -2092,7 +2087,6 @@ int __init cpuset_init(void) + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + +- number_of_cpusets = 1; + return 0; + } + +diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c +index 0506d447aed2..e911ec662d03 100644 +--- a/kernel/debug/debug_core.c ++++ b/kernel/debug/debug_core.c +@@ -49,6 +49,7 @@ + #include <linux/pid.h> + #include <linux/smp.h> + #include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/rcupdate.h> + + #include <asm/cacheflush.h> +@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) + if (!CACHE_FLUSH_IS_SAFE) + return; + +- if (current->mm && current->mm->mmap_cache) { +- flush_cache_range(current->mm->mmap_cache, +- addr, addr + BREAK_INSTR_SIZE); ++ if (current->mm) { ++ int i; ++ ++ for (i = 0; i < VMACACHE_SIZE; i++) { ++ if (!current->vmacache[i]) ++ continue; ++ flush_cache_range(current->vmacache[i], ++ addr, addr + BREAK_INSTR_SIZE); ++ } + } ++ + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index 143962949bed..29a1b0283d3b 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -28,6 +28,8 @@ + #include <linux/mman.h> + #include <linux/mmu_notifier.h> + #include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/nsproxy.h> + #include <linux/capability.h> + #include <linux/cpu.h> +@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + + mm->locked_vm = 0; + mm->mmap = NULL; +- mm->mmap_cache = NULL; ++ mm->vmacache_seqnum = 0; + mm->map_count = 0; + cpumask_clear(mm_cpumask(mm)); + mm->mm_rb = RB_ROOT; +@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) + if (!oldmm) + return 0; + ++ /* initialize the new vmacache entries */ ++ vmacache_flush(tsk); ++ + if (clone_flags & CLONE_VM) { + atomic_inc(&oldmm->mm_users); + mm = oldmm; +diff --git a/lib/plist.c 
b/lib/plist.c +index 1ebc95f7a46f..0f2084d30798 100644 +--- a/lib/plist.c ++++ b/lib/plist.c +@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) + plist_check_head(head); + } + ++/** ++ * plist_requeue - Requeue @node at end of same-prio entries. ++ * ++ * This is essentially an optimized plist_del() followed by ++ * plist_add(). It moves an entry already in the plist to ++ * after any other same-priority entries. ++ * ++ * @node: &struct plist_node pointer - entry to be moved ++ * @head: &struct plist_head pointer - list head ++ */ ++void plist_requeue(struct plist_node *node, struct plist_head *head) ++{ ++ struct plist_node *iter; ++ struct list_head *node_next = &head->node_list; ++ ++ plist_check_head(head); ++ BUG_ON(plist_head_empty(head)); ++ BUG_ON(plist_node_empty(node)); ++ ++ if (node == plist_last(head)) ++ return; ++ ++ iter = plist_next(node); ++ ++ if (node->prio != iter->prio) ++ return; ++ ++ plist_del(node, head); ++ ++ plist_for_each_continue(iter, head) { ++ if (node->prio != iter->prio) { ++ node_next = &iter->node_list; ++ break; ++ } ++ } ++ list_add_tail(&node->node_list, node_next); ++ ++ plist_check_head(head); ++} ++ + #ifdef CONFIG_DEBUG_PI_LIST + #include <linux/sched.h> + #include <linux/module.h> +@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect) + BUG_ON(prio_pos->prio_list.next != &first->prio_list); + } + ++static void __init plist_test_requeue(struct plist_node *node) ++{ ++ plist_requeue(node, &test_head); ++ ++ if (node != plist_last(&test_head)) ++ BUG_ON(node->prio == plist_next(node)->prio); ++} ++ + static int __init plist_test(void) + { + int nr_expect = 0, i, loop; +@@ -193,6 +241,10 @@ static int __init plist_test(void) + nr_expect--; + } + plist_test_check(nr_expect); ++ if (!plist_node_empty(test_node + i)) { ++ plist_test_requeue(test_node + i); ++ plist_test_check(nr_expect); ++ } + } + + for (i = 0; i < ARRAY_SIZE(test_node); i++) { +diff --git a/lib/radix-tree.c b/lib/radix-tree.c +index 7811ed3b4e70..e8adb5d8a184 100644 +--- a/lib/radix-tree.c ++++ b/lib/radix-tree.c +@@ -946,81 +946,6 @@ next: + } + EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); + +- +-/** +- * radix_tree_next_hole - find the next hole (not-present entry) +- * @root: tree root +- * @index: index key +- * @max_scan: maximum range to search +- * +- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest +- * indexed hole. +- * +- * Returns: the index of the hole if found, otherwise returns an index +- * outside of the set specified (in which case 'return - index >= max_scan' +- * will be true). In rare cases of index wrap-around, 0 will be returned. +- * +- * radix_tree_next_hole may be called under rcu_read_lock. However, like +- * radix_tree_gang_lookup, this will not atomically search a snapshot of +- * the tree at a single point in time. For example, if a hole is created +- * at index 5, then subsequently a hole is created at index 10, +- * radix_tree_next_hole covering both indexes may return 10 if called +- * under rcu_read_lock. 
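plist_requeue(), implemented above, exists so an entry can be rotated behind other entries of the same priority without a full del/add cycle. A minimal sketch of that round-robin pattern follows; it is not part of the patch, the struct and function names are invented, and the swap code in this patch applies the same idea to swap_avail_head:

#include <linux/plist.h>

struct example_dev {
	struct plist_node avail;	/* keyed by priority */
};

static PLIST_HEAD(example_head);

/*
 * Pick the highest-priority entry (plist keeps nodes sorted by ascending
 * ->prio, so the first node has the smallest prio value) and rotate it
 * behind any other entries of the same priority, so repeated callers
 * round-robin across equal-priority devices.
 */
static struct example_dev *example_pick_and_rotate(void)
{
	struct example_dev *dev;

	if (plist_head_empty(&example_head))
		return NULL;

	dev = plist_first_entry(&example_head, struct example_dev, avail);
	plist_requeue(&dev->avail, &example_head);
	return dev;
}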
+- */ +-unsigned long radix_tree_next_hole(struct radix_tree_root *root, +- unsigned long index, unsigned long max_scan) +-{ +- unsigned long i; +- +- for (i = 0; i < max_scan; i++) { +- if (!radix_tree_lookup(root, index)) +- break; +- index++; +- if (index == 0) +- break; +- } +- +- return index; +-} +-EXPORT_SYMBOL(radix_tree_next_hole); +- +-/** +- * radix_tree_prev_hole - find the prev hole (not-present entry) +- * @root: tree root +- * @index: index key +- * @max_scan: maximum range to search +- * +- * Search backwards in the range [max(index-max_scan+1, 0), index] +- * for the first hole. +- * +- * Returns: the index of the hole if found, otherwise returns an index +- * outside of the set specified (in which case 'index - return >= max_scan' +- * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. +- * +- * radix_tree_next_hole may be called under rcu_read_lock. However, like +- * radix_tree_gang_lookup, this will not atomically search a snapshot of +- * the tree at a single point in time. For example, if a hole is created +- * at index 10, then subsequently a hole is created at index 5, +- * radix_tree_prev_hole covering both indexes may return 5 if called under +- * rcu_read_lock. +- */ +-unsigned long radix_tree_prev_hole(struct radix_tree_root *root, +- unsigned long index, unsigned long max_scan) +-{ +- unsigned long i; +- +- for (i = 0; i < max_scan; i++) { +- if (!radix_tree_lookup(root, index)) +- break; +- index--; +- if (index == ULONG_MAX) +- break; +- } +- +- return index; +-} +-EXPORT_SYMBOL(radix_tree_prev_hole); +- + /** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root +@@ -1335,15 +1260,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) + } + + /** +- * radix_tree_delete - delete an item from a radix tree ++ * radix_tree_delete_item - delete an item from a radix tree + * @root: radix tree root + * @index: index key ++ * @item: expected item + * +- * Remove the item at @index from the radix tree rooted at @root. ++ * Remove @item at @index from the radix tree rooted at @root. + * +- * Returns the address of the deleted item, or NULL if it was not present. ++ * Returns the address of the deleted item, or NULL if it was not present ++ * or the entry at the given @index was not @item. + */ +-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) ++void *radix_tree_delete_item(struct radix_tree_root *root, ++ unsigned long index, void *item) + { + struct radix_tree_node *node = NULL; + struct radix_tree_node *slot = NULL; +@@ -1378,6 +1306,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) + if (slot == NULL) + goto out; + ++ if (item && slot != item) { ++ slot = NULL; ++ goto out; ++ } ++ + /* + * Clear all tags associated with the item to be deleted. + * This way of doing it would be inefficient, but seldom is any set. +@@ -1422,6 +1355,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) + out: + return slot; + } ++EXPORT_SYMBOL(radix_tree_delete_item); ++ ++/** ++ * radix_tree_delete - delete an item from a radix tree ++ * @root: radix tree root ++ * @index: index key ++ * ++ * Remove the item at @index from the radix tree rooted at @root. ++ * ++ * Returns the address of the deleted item, or NULL if it was not present. 
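radix_tree_delete_item(), added above, only removes the entry if the slot still contains the object the caller expects, which lets callers collapse a lookup-then-delete pair into one call. A minimal sketch, not part of the patch; the function name is invented and it assumes the caller holds whatever lock normally protects the tree:

#include <linux/types.h>
#include <linux/radix-tree.h>

/*
 * Returns true if @expected was still installed at @index and has now
 * been removed; false if the slot was empty or had been replaced by
 * something else (e.g. a shadow entry) in the meantime.
 */
static bool example_remove_if_current(struct radix_tree_root *root,
				      unsigned long index, void *expected)
{
	return radix_tree_delete_item(root, index, expected) == expected;
}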
++ */ ++void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) ++{ ++ return radix_tree_delete_item(root, index, NULL); ++} + EXPORT_SYMBOL(radix_tree_delete); + + /** +diff --git a/mm/Makefile b/mm/Makefile +index 305d10acd081..fb51bc61d80a 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ + readahead.o swap.o truncate.o vmscan.o shmem.o \ + util.o mmzone.o vmstat.o backing-dev.o \ + mm_init.o mmu_context.o percpu.o slab_common.o \ +- compaction.o balloon_compaction.o \ ++ compaction.o balloon_compaction.o vmacache.o \ + interval_tree.o list_lru.o $(mmu-y) + + obj-y += init-mm.o +diff --git a/mm/compaction.c b/mm/compaction.c +index 6441083e76d3..adb6d0560e96 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long pfn; + +- zone->compact_cached_migrate_pfn = start_pfn; ++ zone->compact_cached_migrate_pfn[0] = start_pfn; ++ zone->compact_cached_migrate_pfn[1] = start_pfn; + zone->compact_cached_free_pfn = end_pfn; + zone->compact_blockskip_flush = false; + +@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) + */ + static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, +- bool migrate_scanner) ++ bool set_unsuitable, bool migrate_scanner) + { + struct zone *zone = cc->zone; ++ unsigned long pfn; + + if (cc->ignore_skip_hint) + return; +@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, + if (!page) + return; + +- if (!nr_isolated) { +- unsigned long pfn = page_to_pfn(page); ++ if (nr_isolated) ++ return; ++ ++ /* ++ * Only skip pageblocks when all forms of compaction will be known to ++ * fail in the near future. 
++ */ ++ if (set_unsuitable) + set_pageblock_skip(page); + +- /* Update where compaction should restart */ +- if (migrate_scanner) { +- if (!cc->finished_update_migrate && +- pfn > zone->compact_cached_migrate_pfn) +- zone->compact_cached_migrate_pfn = pfn; +- } else { +- if (!cc->finished_update_free && +- pfn < zone->compact_cached_free_pfn) +- zone->compact_cached_free_pfn = pfn; +- } ++ pfn = page_to_pfn(page); ++ ++ /* Update where async and sync compaction should restart */ ++ if (migrate_scanner) { ++ if (cc->finished_update_migrate) ++ return; ++ if (pfn > zone->compact_cached_migrate_pfn[0]) ++ zone->compact_cached_migrate_pfn[0] = pfn; ++ if (cc->mode != MIGRATE_ASYNC && ++ pfn > zone->compact_cached_migrate_pfn[1]) ++ zone->compact_cached_migrate_pfn[1] = pfn; ++ } else { ++ if (cc->finished_update_free) ++ return; ++ if (pfn < zone->compact_cached_free_pfn) ++ zone->compact_cached_free_pfn = pfn; + } + } + #else +@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, + + static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, +- bool migrate_scanner) ++ bool set_unsuitable, bool migrate_scanner) + { + } + #endif /* CONFIG_COMPACTION */ +@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + } + + /* async aborts if taking too long or contended */ +- if (!cc->sync) { ++ if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return false; + } +@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + return true; + } + +-static inline bool compact_trylock_irqsave(spinlock_t *lock, +- unsigned long *flags, struct compact_control *cc) ++/* ++ * Aside from avoiding lock contention, compaction also periodically checks ++ * need_resched() and either schedules in sync compaction or aborts async ++ * compaction. This is similar to what compact_checklock_irqsave() does, but ++ * is used where no lock is concerned. ++ * ++ * Returns false when no scheduling was needed, or sync compaction scheduled. ++ * Returns true when async compaction should abort. 
++ */ ++static inline bool compact_should_abort(struct compact_control *cc) + { +- return compact_checklock_irqsave(lock, flags, false, cc); ++ /* async compaction aborts if contended */ ++ if (need_resched()) { ++ if (cc->mode == MIGRATE_ASYNC) { ++ cc->contended = true; ++ return true; ++ } ++ ++ cond_resched(); ++ } ++ ++ return false; + } + + /* Returns true if the page is within a block suitable for migration to */ + static bool suitable_migration_target(struct page *page) + { +- int migratetype = get_pageblock_migratetype(page); +- +- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ +- if (migratetype == MIGRATE_RESERVE) +- return false; +- +- if (is_migrate_isolate(migratetype)) +- return false; +- +- /* If the page is a large free page, then allow migration */ ++ /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) +- return true; ++ return false; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ +- if (migrate_async_suitable(migratetype)) ++ if (migrate_async_suitable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ +@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, + struct page *cursor, *valid_page = NULL; + unsigned long flags; + bool locked = false; ++ bool checked_pageblock = false; + + cursor = pfn_to_page(blockpfn); + +@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, + break; + + /* Recheck this is a suitable migration target under lock */ +- if (!strict && !suitable_migration_target(page)) +- break; ++ if (!strict && !checked_pageblock) { ++ /* ++ * We need to check suitability of pageblock only once ++ * and this isolate_freepages_block() is called with ++ * pageblock range, so just check once is sufficient. ++ */ ++ checked_pageblock = true; ++ if (!suitable_migration_target(page)) ++ break; ++ } + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) +@@ -330,7 +362,8 @@ isolate_fail: + + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) +- update_pageblock_skip(cc, valid_page, total_isolated, false); ++ update_pageblock_skip(cc, valid_page, total_isolated, true, ++ false); + + count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) +@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long last_pageblock_nr = 0, pageblock_nr; + unsigned long nr_scanned = 0, nr_isolated = 0; + struct list_head *migratelist = &cc->migratepages; +- isolate_mode_t mode = 0; + struct lruvec *lruvec; + unsigned long flags; + bool locked = false; + struct page *page = NULL, *valid_page = NULL; ++ bool set_unsuitable = true; ++ const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? ++ ISOLATE_ASYNC_MIGRATE : 0) | ++ (unevictable ? 
ISOLATE_UNEVICTABLE : 0); + + /* + * Ensure that there are not too many pages isolated from the LRU +@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + */ + while (unlikely(too_many_isolated(zone))) { + /* async migration should just abort */ +- if (!cc->sync) ++ if (cc->mode == MIGRATE_ASYNC) + return 0; + + congestion_wait(BLK_RW_ASYNC, HZ/10); +@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + return 0; + } + ++ if (compact_should_abort(cc)) ++ return 0; ++ + /* Time to isolate some pages for migration */ +- cond_resched(); + for (; low_pfn < end_pfn; low_pfn++) { + /* give a chance to irqs before checking need_resched() */ +- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { ++ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; +@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; +- if (!isolation_suitable(cc, page)) +- goto next_pageblock; ++ if (last_pageblock_nr != pageblock_nr) { ++ int mt; ++ ++ last_pageblock_nr = pageblock_nr; ++ if (!isolation_suitable(cc, page)) ++ goto next_pageblock; ++ ++ /* ++ * For async migration, also only scan in MOVABLE ++ * blocks. Async migration is optimistic to see if ++ * the minimum amount of work satisfies the allocation ++ */ ++ mt = get_pageblock_migratetype(page); ++ if (cc->mode == MIGRATE_ASYNC && ++ !migrate_async_suitable(mt)) { ++ set_unsuitable = false; ++ goto next_pageblock; ++ } ++ } + + /* Skip if free */ + if (PageBuddy(page)) + continue; + + /* +- * For async migration, also only scan in MOVABLE blocks. Async +- * migration is optimistic to see if the minimum amount of work +- * satisfies the allocation +- */ +- if (!cc->sync && last_pageblock_nr != pageblock_nr && +- !migrate_async_suitable(get_pageblock_migratetype(page))) { +- cc->finished_update_migrate = true; +- goto next_pageblock; +- } +- +- /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU pages and balloon pages + * Skip any other type of page +@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + if (unlikely(balloon_page_movable(page))) { + if (locked && balloon_page_isolate(page)) { + /* Successfully isolated */ +- cc->finished_update_migrate = true; +- list_add(&page->lru, migratelist); +- cc->nr_migratepages++; +- nr_isolated++; +- goto check_compact_cluster; ++ goto isolate_success; + } + } + continue; +@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + continue; + } + ++ /* ++ * Migration will fail if an anonymous page is pinned in memory, ++ * so avoid taking lru_lock and isolating it unnecessarily in an ++ * admittedly racy check. 
++ */ ++ if (!page_mapping(page) && ++ page_count(page) > page_mapcount(page)) ++ continue; ++ + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); +@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + continue; + } + +- if (!cc->sync) +- mode |= ISOLATE_ASYNC_MIGRATE; +- +- if (unevictable) +- mode |= ISOLATE_UNEVICTABLE; +- + lruvec = mem_cgroup_page_lruvec(page, zone); + + /* Try isolate the page */ +@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + VM_BUG_ON(PageTransCompound(page)); + + /* Successfully isolated */ +- cc->finished_update_migrate = true; + del_page_from_lru_list(page, lruvec, page_lru(page)); ++ ++isolate_success: ++ cc->finished_update_migrate = true; + list_add(&page->lru, migratelist); + cc->nr_migratepages++; + nr_isolated++; + +-check_compact_cluster: + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + ++low_pfn; +@@ -626,7 +670,6 @@ check_compact_cluster: + + next_pageblock: + low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; +- last_pageblock_nr = pageblock_nr; + } + + acct_isolated(zone, locked, cc); +@@ -634,9 +677,13 @@ next_pageblock: + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); + +- /* Update the pageblock-skip if the whole pageblock was scanned */ ++ /* ++ * Update the pageblock-skip information and cached scanner pfn, ++ * if the whole pageblock was scanned without isolating any page. ++ */ + if (low_pfn == end_pfn) +- update_pageblock_skip(cc, valid_page, nr_isolated, true); ++ update_pageblock_skip(cc, valid_page, nr_isolated, ++ set_unsuitable, true); + + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + +@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone, + struct compact_control *cc) + { + struct page *page; +- unsigned long high_pfn, low_pfn, pfn, z_end_pfn; ++ unsigned long block_start_pfn; /* start of current pageblock */ ++ unsigned long block_end_pfn; /* end of current pageblock */ ++ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + +@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone, + * Initialise the free scanner. The starting point is where we last + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to +- * the pageblock boundary, because we do pfn -= pageblock_nr_pages +- * in the for loop. ++ * the pageblock boundary, because we do ++ * block_start_pfn -= pageblock_nr_pages in the for loop. ++ * For ending point, take care when isolating in last pageblock of a ++ * a zone which ends in the middle of a pageblock. + * The low boundary is the end of the pageblock the migration scanner + * is using. + */ +- pfn = cc->free_pfn & ~(pageblock_nr_pages-1); ++ block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); ++ block_end_pfn = min(block_start_pfn + pageblock_nr_pages, ++ zone_end_pfn(zone)); + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + + /* +- * Take care that if the migration scanner is at the end of the zone +- * that the free scanner does not accidentally move to the next zone +- * in the next isolation cycle. 
+- */ +- high_pfn = min(low_pfn, pfn); +- +- z_end_pfn = zone_end_pfn(zone); +- +- /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ +- for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; +- pfn -= pageblock_nr_pages) { ++ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; ++ block_end_pfn = block_start_pfn, ++ block_start_pfn -= pageblock_nr_pages) { + unsigned long isolated; +- unsigned long end_pfn; + + /* + * This can iterate a massively long zone without finding any + * suitable migration targets, so periodically check if we need +- * to schedule. ++ * to schedule, or even abort async compaction. + */ +- cond_resched(); ++ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) ++ && compact_should_abort(cc)) ++ break; + +- if (!pfn_valid(pfn)) ++ if (!pfn_valid(block_start_pfn)) + continue; + + /* +@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone, + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ +- page = pfn_to_page(pfn); ++ page = pfn_to_page(block_start_pfn); + if (page_zone(page) != zone) + continue; + +@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone, + continue; + + /* Found a block suitable for isolating free pages from */ +- isolated = 0; ++ cc->free_pfn = block_start_pfn; ++ isolated = isolate_freepages_block(cc, block_start_pfn, ++ block_end_pfn, freelist, false); ++ nr_freepages += isolated; + + /* +- * Take care when isolating in last pageblock of a zone which +- * ends in the middle of a pageblock. ++ * Set a flag that we successfully isolated in this pageblock. ++ * In the next loop iteration, zone->compact_cached_free_pfn ++ * will not be updated and thus it will effectively contain the ++ * highest pageblock we isolated pages from. + */ +- end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); +- isolated = isolate_freepages_block(cc, pfn, end_pfn, +- freelist, false); +- nr_freepages += isolated; ++ if (isolated) ++ cc->finished_update_free = true; + + /* +- * Record the highest PFN we isolated pages from. When next +- * looking for free pages, the search will restart here as +- * page migration may have returned some pages to the allocator ++ * isolate_freepages_block() might have aborted due to async ++ * compaction being contended + */ +- if (isolated) { +- cc->finished_update_free = true; +- high_pfn = max(high_pfn, pfn); +- } ++ if (cc->contended) ++ break; + } + + /* split_free_page does not map the pages */ +@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone, + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ +- if (pfn < low_pfn) +- cc->free_pfn = max(pfn, zone->zone_start_pfn); +- else +- cc->free_pfn = high_pfn; ++ if (block_start_pfn < low_pfn) ++ cc->free_pfn = cc->migrate_pfn; ++ + cc->nr_freepages = nr_freepages; + } + +@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage, + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + +- /* Isolate free pages if necessary */ ++ /* ++ * Isolate free pages if necessary, and if we are not aborting due to ++ * contention. 
++ */ + if (list_empty(&cc->freepages)) { +- isolate_freepages(cc->zone, cc); ++ if (!cc->contended) ++ isolate_freepages(cc->zone, cc); + + if (list_empty(&cc->freepages)) + return NULL; +@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage, + } + + /* +- * We cannot control nr_migratepages and nr_freepages fully when migration is +- * running as migrate_pages() has no knowledge of compact_control. When +- * migration is complete, we count the number of pages on the lists by hand. ++ * This is a migrate-callback that "frees" freepages back to the isolated ++ * freelist. All pages on the freelist are from the same zone, so there is no ++ * special handling needed for NUMA. + */ +-static void update_nr_listpages(struct compact_control *cc) ++static void compaction_free(struct page *page, unsigned long data) + { +- int nr_migratepages = 0; +- int nr_freepages = 0; +- struct page *page; +- +- list_for_each_entry(page, &cc->migratepages, lru) +- nr_migratepages++; +- list_for_each_entry(page, &cc->freepages, lru) +- nr_freepages++; ++ struct compact_control *cc = (struct compact_control *)data; + +- cc->nr_migratepages = nr_migratepages; +- cc->nr_freepages = nr_freepages; ++ list_add(&page->lru, &cc->freepages); ++ cc->nr_freepages++; + } + + /* possible outcome of isolate_migratepages */ +@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone, + unsigned int order; + unsigned long watermark; + +- if (fatal_signal_pending(current)) ++ if (cc->contended || fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) { ++ /* Let the next compaction start anew. */ ++ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; ++ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; ++ zone->compact_cached_free_pfn = zone_end_pfn(zone); ++ + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the +@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); ++ const bool sync = cc->mode != MIGRATE_ASYNC; + + ret = compaction_suitable(zone, cc->order); + switch (ret) { +@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. 
+ */ +- cc->migrate_pfn = zone->compact_cached_migrate_pfn; ++ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); +@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; +- zone->compact_cached_migrate_pfn = cc->migrate_pfn; ++ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; ++ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } + ++ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); ++ + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { +- unsigned long nr_migrate, nr_remaining; + int err; + + switch (isolate_migratepages(zone, cc)) { +@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + ; + } + +- nr_migrate = cc->nr_migratepages; ++ if (!cc->nr_migratepages) ++ continue; ++ + err = migrate_pages(&cc->migratepages, compaction_alloc, +- (unsigned long)cc, +- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, ++ compaction_free, (unsigned long)cc, cc->mode, + MR_COMPACTION); +- update_nr_listpages(cc); +- nr_remaining = cc->nr_migratepages; + +- trace_mm_compaction_migratepages(nr_migrate - nr_remaining, +- nr_remaining); ++ trace_mm_compaction_migratepages(cc->nr_migratepages, err, ++ &cc->migratepages); + +- /* Release isolated pages not migrated */ ++ /* All pages were either migrated or will be released */ ++ cc->nr_migratepages = 0; + if (err) { + putback_movable_pages(&cc->migratepages); +- cc->nr_migratepages = 0; + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it +@@ -1035,12 +1084,13 @@ out: + cc->nr_freepages -= release_freepages(&cc->freepages); + VM_BUG_ON(cc->nr_freepages != 0); + ++ trace_mm_compaction_end(ret); ++ + return ret; + } + +-static unsigned long compact_zone_order(struct zone *zone, +- int order, gfp_t gfp_mask, +- bool sync, bool *contended) ++static unsigned long compact_zone_order(struct zone *zone, int order, ++ gfp_t gfp_mask, enum migrate_mode mode, bool *contended) + { + unsigned long ret; + struct compact_control cc = { +@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone, + .order = order, + .migratetype = allocflags_to_migratetype(gfp_mask), + .zone = zone, +- .sync = sync, ++ .mode = mode, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); +@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500; + * @order: The order of the current allocation + * @gfp_mask: The GFP mask of the current allocation + * @nodemask: The allowed nodes to allocate from +- * @sync: Whether migration is synchronous or not ++ * @mode: The migration mode for async, sync light, or sync migration + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction + * +@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500; + */ + unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask, +- bool sync, bool *contended) ++ enum migrate_mode mode, bool *contended) + { + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + int may_enter_fs = gfp_mask & __GFP_FS; +@@ -1104,7 +1154,7 @@ unsigned long 
try_to_compact_pages(struct zonelist *zonelist, + nodemask) { + int status; + +- status = compact_zone_order(zone, order, gfp_mask, sync, ++ status = compact_zone_order(zone, order, gfp_mask, mode, + contended); + rc = max(status, rc); + +@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) + compact_zone(zone, cc); + + if (cc->order > 0) { +- int ok = zone_watermark_ok(zone, cc->order, +- low_wmark_pages(zone), 0, 0); +- if (ok && cc->order >= zone->compact_order_failed) +- zone->compact_order_failed = cc->order + 1; +- /* Currently async compaction is never deferred. */ +- else if (!ok && cc->sync) +- defer_compaction(zone, cc->order); ++ if (zone_watermark_ok(zone, cc->order, ++ low_wmark_pages(zone), 0, 0)) ++ compaction_defer_reset(zone, cc->order, false); + } + + VM_BUG_ON(!list_empty(&cc->freepages)); +@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) + { + struct compact_control cc = { + .order = order, +- .sync = false, ++ .mode = MIGRATE_ASYNC, + }; + + if (!order) +@@ -1171,7 +1217,8 @@ static void compact_node(int nid) + { + struct compact_control cc = { + .order = -1, +- .sync = true, ++ .mode = MIGRATE_SYNC, ++ .ignore_skip_hint = true, + }; + + __compact_pgdat(NODE_DATA(nid), &cc); +diff --git a/mm/filemap.c b/mm/filemap.c +index ae4846ff4849..b012daefc2d7 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping) + { + int ret = 0; + /* Check for outstanding write errors */ +- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ++ if (test_bit(AS_ENOSPC, &mapping->flags) && ++ test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; +- if (test_and_clear_bit(AS_EIO, &mapping->flags)) ++ if (test_bit(AS_EIO, &mapping->flags) && ++ test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; + } +@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) + } + EXPORT_SYMBOL_GPL(replace_page_cache_page); + ++static int page_cache_tree_insert(struct address_space *mapping, ++ struct page *page) ++{ ++ void **slot; ++ int error; ++ ++ slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); ++ if (slot) { ++ void *p; ++ ++ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); ++ if (!radix_tree_exceptional_entry(p)) ++ return -EEXIST; ++ radix_tree_replace_slot(slot, page); ++ mapping->nrpages++; ++ return 0; ++ } ++ error = radix_tree_insert(&mapping->page_tree, page->index, page); ++ if (!error) ++ mapping->nrpages++; ++ return error; ++} ++ + /** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add +@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); +- error = radix_tree_insert(&mapping->page_tree, offset, page); ++ error = page_cache_tree_insert(mapping, page); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; +- mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); +@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp) + if (cpuset_do_page_mem_spread()) { + unsigned int cpuset_mems_cookie; + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); +- } while 
(!put_mems_allowed(cpuset_mems_cookie) && !page); ++ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + + return page; + } +@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page); + */ + void end_page_writeback(struct page *page) + { +- if (TestClearPageReclaim(page)) ++ /* ++ * TestClearPageReclaim could be used here but it is an atomic ++ * operation and overkill in this particular case. Failing to ++ * shuffle a page marked for immediate reclaim is too mild to ++ * justify taking an atomic operation penalty at the end of ++ * ever page writeback. ++ */ ++ if (PageReclaim(page)) { ++ ClearPageReclaim(page); + rotate_reclaimable_page(page); ++ } + + if (!test_clear_page_writeback(page)) + BUG(); +@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + } + + /** +- * find_get_page - find and get a page reference ++ * page_cache_next_hole - find the next hole (not-present entry) ++ * @mapping: mapping ++ * @index: index ++ * @max_scan: maximum range to search ++ * ++ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the ++ * lowest indexed hole. ++ * ++ * Returns: the index of the hole if found, otherwise returns an index ++ * outside of the set specified (in which case 'return - index >= ++ * max_scan' will be true). In rare cases of index wrap-around, 0 will ++ * be returned. ++ * ++ * page_cache_next_hole may be called under rcu_read_lock. However, ++ * like radix_tree_gang_lookup, this will not atomically search a ++ * snapshot of the tree at a single point in time. For example, if a ++ * hole is created at index 5, then subsequently a hole is created at ++ * index 10, page_cache_next_hole covering both indexes may return 10 ++ * if called under rcu_read_lock. ++ */ ++pgoff_t page_cache_next_hole(struct address_space *mapping, ++ pgoff_t index, unsigned long max_scan) ++{ ++ unsigned long i; ++ ++ for (i = 0; i < max_scan; i++) { ++ struct page *page; ++ ++ page = radix_tree_lookup(&mapping->page_tree, index); ++ if (!page || radix_tree_exceptional_entry(page)) ++ break; ++ index++; ++ if (index == 0) ++ break; ++ } ++ ++ return index; ++} ++EXPORT_SYMBOL(page_cache_next_hole); ++ ++/** ++ * page_cache_prev_hole - find the prev hole (not-present entry) ++ * @mapping: mapping ++ * @index: index ++ * @max_scan: maximum range to search ++ * ++ * Search backwards in the range [max(index-max_scan+1, 0), index] for ++ * the first hole. ++ * ++ * Returns: the index of the hole if found, otherwise returns an index ++ * outside of the set specified (in which case 'index - return >= ++ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX ++ * will be returned. ++ * ++ * page_cache_prev_hole may be called under rcu_read_lock. However, ++ * like radix_tree_gang_lookup, this will not atomically search a ++ * snapshot of the tree at a single point in time. For example, if a ++ * hole is created at index 10, then subsequently a hole is created at ++ * index 5, page_cache_prev_hole covering both indexes may return 5 if ++ * called under rcu_read_lock. 
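page_cache_next_hole(), added above as the replacement for radix_tree_next_hole(), works at the mapping level and treats shadow (exceptional) entries as holes. A minimal sketch of how a caller might measure a run of already-cached pages; it is not part of the patch, the function name is invented, and rcu_read_lock() is permitted per the comment above:

#include <linux/pagemap.h>
#include <linux/rcupdate.h>

/*
 * How many consecutive pages starting at @index are already present in
 * the page cache?  Capped at @max_scan; shadow entries count as holes.
 */
static unsigned long example_cached_run_length(struct address_space *mapping,
					       pgoff_t index,
					       unsigned long max_scan)
{
	pgoff_t hole;

	rcu_read_lock();
	hole = page_cache_next_hole(mapping, index, max_scan);
	rcu_read_unlock();

	if (hole - index >= max_scan)	/* no hole found within max_scan */
		return max_scan;
	return hole - index;
}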
++ */ ++pgoff_t page_cache_prev_hole(struct address_space *mapping, ++ pgoff_t index, unsigned long max_scan) ++{ ++ unsigned long i; ++ ++ for (i = 0; i < max_scan; i++) { ++ struct page *page; ++ ++ page = radix_tree_lookup(&mapping->page_tree, index); ++ if (!page || radix_tree_exceptional_entry(page)) ++ break; ++ index--; ++ if (index == ULONG_MAX) ++ break; ++ } ++ ++ return index; ++} ++EXPORT_SYMBOL(page_cache_prev_hole); ++ ++/** ++ * find_get_entry - find and get a page cache entry + * @mapping: the address_space to search +- * @offset: the page index ++ * @offset: the page cache index ++ * ++ * Looks up the page cache slot at @mapping & @offset. If there is a ++ * page cache page, it is returned with an increased refcount. + * +- * Is there a pagecache struct page at the given (mapping, offset) tuple? +- * If yes, increment its refcount and return it; if no, return NULL. ++ * If the slot holds a shadow entry of a previously evicted page, it ++ * is returned. ++ * ++ * Otherwise, %NULL is returned. + */ +-struct page *find_get_page(struct address_space *mapping, pgoff_t offset) ++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) + { + void **pagep; + struct page *page; +@@ -734,24 +854,30 @@ out: + + return page; + } +-EXPORT_SYMBOL(find_get_page); ++EXPORT_SYMBOL(find_get_entry); + + /** +- * find_lock_page - locate, pin and lock a pagecache page ++ * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search +- * @offset: the page index ++ * @offset: the page cache index ++ * ++ * Looks up the page cache slot at @mapping & @offset. If there is a ++ * page cache page, it is returned locked and with an increased ++ * refcount. + * +- * Locates the desired pagecache page, locks it, increments its reference +- * count and returns its address. ++ * If the slot holds a shadow entry of a previously evicted page, it ++ * is returned. + * +- * Returns zero if the page was not present. find_lock_page() may sleep. ++ * Otherwise, %NULL is returned. ++ * ++ * find_lock_entry() may sleep. + */ +-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) ++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) + { + struct page *page; + + repeat: +- page = find_get_page(mapping, offset); ++ page = find_get_entry(mapping, offset); + if (page && !radix_tree_exception(page)) { + lock_page(page); + /* Has the page been truncated? */ +@@ -764,44 +890,87 @@ repeat: + } + return page; + } +-EXPORT_SYMBOL(find_lock_page); ++EXPORT_SYMBOL(find_lock_entry); + + /** +- * find_or_create_page - locate or add a pagecache page +- * @mapping: the page's address_space +- * @index: the page's index into the mapping +- * @gfp_mask: page allocation mode ++ * pagecache_get_page - find and get a page reference ++ * @mapping: the address_space to search ++ * @offset: the page index ++ * @fgp_flags: PCG flags ++ * @gfp_mask: gfp mask to use if a page is to be allocated ++ * ++ * Looks up the page cache slot at @mapping & @offset. ++ * ++ * PCG flags modify how the page is returned + * +- * Locates a page in the pagecache. If the page is not present, a new page +- * is allocated using @gfp_mask and is added to the pagecache and to the VM's +- * LRU list. The returned page is locked and has its reference count +- * incremented. 
++ * FGP_ACCESSED: the page will be marked accessed ++ * FGP_LOCK: Page is return locked ++ * FGP_CREAT: If page is not present then a new page is allocated using ++ * @gfp_mask and added to the page cache and the VM's LRU ++ * list. The page is returned locked and with an increased ++ * refcount. Otherwise, %NULL is returned. + * +- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic +- * allocation! ++ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even ++ * if the GFP flags specified for FGP_CREAT are atomic. + * +- * find_or_create_page() returns the desired page's address, or zero on +- * memory exhaustion. ++ * If there is a page cache page, it is returned with an increased refcount. + */ +-struct page *find_or_create_page(struct address_space *mapping, +- pgoff_t index, gfp_t gfp_mask) ++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, ++ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) + { + struct page *page; +- int err; ++ + repeat: +- page = find_lock_page(mapping, index); +- if (!page) { +- page = __page_cache_alloc(gfp_mask); ++ page = find_get_entry(mapping, offset); ++ if (radix_tree_exceptional_entry(page)) ++ page = NULL; ++ if (!page) ++ goto no_page; ++ ++ if (fgp_flags & FGP_LOCK) { ++ if (fgp_flags & FGP_NOWAIT) { ++ if (!trylock_page(page)) { ++ page_cache_release(page); ++ return NULL; ++ } ++ } else { ++ lock_page(page); ++ } ++ ++ /* Has the page been truncated? */ ++ if (unlikely(page->mapping != mapping)) { ++ unlock_page(page); ++ page_cache_release(page); ++ goto repeat; ++ } ++ VM_BUG_ON(page->index != offset); ++ } ++ ++ if (page && (fgp_flags & FGP_ACCESSED)) ++ mark_page_accessed(page); ++ ++no_page: ++ if (!page && (fgp_flags & FGP_CREAT)) { ++ int err; ++ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) ++ cache_gfp_mask |= __GFP_WRITE; ++ if (fgp_flags & FGP_NOFS) { ++ cache_gfp_mask &= ~__GFP_FS; ++ radix_gfp_mask &= ~__GFP_FS; ++ } ++ ++ page = __page_cache_alloc(cache_gfp_mask); + if (!page) + return NULL; +- /* +- * We want a regular kernel memory (not highmem or DMA etc) +- * allocation for the radix tree nodes, but we need to honour +- * the context-specific requirements the caller has asked for. +- * GFP_RECLAIM_MASK collects those requirements. +- */ +- err = add_to_page_cache_lru(page, mapping, index, +- (gfp_mask & GFP_RECLAIM_MASK)); ++ ++ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) ++ fgp_flags |= FGP_LOCK; ++ ++ /* Init accessed so avoit atomic mark_page_accessed later */ ++ if (fgp_flags & FGP_ACCESSED) ++ init_page_accessed(page); ++ ++ err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; +@@ -809,9 +978,80 @@ repeat: + goto repeat; + } + } ++ + return page; + } +-EXPORT_SYMBOL(find_or_create_page); ++EXPORT_SYMBOL(pagecache_get_page); ++ ++/** ++ * find_get_entries - gang pagecache lookup ++ * @mapping: The address_space to search ++ * @start: The starting page cache index ++ * @nr_entries: The maximum number of entries ++ * @entries: Where the resulting entries are placed ++ * @indices: The cache indices corresponding to the entries in @entries ++ * ++ * find_get_entries() will search for and return a group of up to ++ * @nr_entries entries in the mapping. The entries are placed at ++ * @entries. find_get_entries() takes a reference against any actual ++ * pages it returns. 
++ * ++ * The search returns a group of mapping-contiguous page cache entries ++ * with ascending indexes. There may be holes in the indices due to ++ * not-present pages. ++ * ++ * Any shadow entries of evicted pages are included in the returned ++ * array. ++ * ++ * find_get_entries() returns the number of pages and shadow entries ++ * which were found. ++ */ ++unsigned find_get_entries(struct address_space *mapping, ++ pgoff_t start, unsigned int nr_entries, ++ struct page **entries, pgoff_t *indices) ++{ ++ void **slot; ++ unsigned int ret = 0; ++ struct radix_tree_iter iter; ++ ++ if (!nr_entries) ++ return 0; ++ ++ rcu_read_lock(); ++restart: ++ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { ++ struct page *page; ++repeat: ++ page = radix_tree_deref_slot(slot); ++ if (unlikely(!page)) ++ continue; ++ if (radix_tree_exception(page)) { ++ if (radix_tree_deref_retry(page)) ++ goto restart; ++ /* ++ * Otherwise, we must be storing a swap entry ++ * here as an exceptional entry: so return it ++ * without attempting to raise page count. ++ */ ++ goto export; ++ } ++ if (!page_cache_get_speculative(page)) ++ goto repeat; ++ ++ /* Has the page moved? */ ++ if (unlikely(page != *slot)) { ++ page_cache_release(page); ++ goto repeat; ++ } ++export: ++ indices[ret] = iter.index; ++ entries[ret] = page; ++ if (++ret == nr_entries) ++ break; ++ } ++ rcu_read_unlock(); ++ return ret; ++} + + /** + * find_get_pages - gang pagecache lookup +@@ -1031,39 +1271,6 @@ repeat: + } + EXPORT_SYMBOL(find_get_pages_tag); + +-/** +- * grab_cache_page_nowait - returns locked page at given index in given cache +- * @mapping: target address_space +- * @index: the page index +- * +- * Same as grab_cache_page(), but do not wait if the page is unavailable. +- * This is intended for speculative data generators, where the data can +- * be regenerated if the page couldn't be grabbed. This routine should +- * be safe to call while holding the lock for another page. +- * +- * Clear __GFP_FS when allocating the page to avoid recursion into the fs +- * and deadlock against the caller's locked page. +- */ +-struct page * +-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) +-{ +- struct page *page = find_get_page(mapping, index); +- +- if (page) { +- if (trylock_page(page)) +- return page; +- page_cache_release(page); +- return NULL; +- } +- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); +- if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { +- page_cache_release(page); +- page = NULL; +- } +- return page; +-} +-EXPORT_SYMBOL(grab_cache_page_nowait); +- + /* + * CD/DVDs are error prone. When a medium error occurs, the driver may fail + * a _large_ part of the i/o request. 
Imagine the worst scenario: +@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) + EXPORT_SYMBOL(generic_file_mmap); + EXPORT_SYMBOL(generic_file_readonly_mmap); + ++static struct page *wait_on_page_read(struct page *page) ++{ ++ if (!IS_ERR(page)) { ++ wait_on_page_locked(page); ++ if (!PageUptodate(page)) { ++ page_cache_release(page); ++ page = ERR_PTR(-EIO); ++ } ++ } ++ return page; ++} ++ + static struct page *__read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *, struct page *), +@@ -1823,6 +2042,8 @@ repeat: + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); ++ } else { ++ page = wait_on_page_read(page); + } + } + return page; +@@ -1859,6 +2080,10 @@ retry: + if (err < 0) { + page_cache_release(page); + return ERR_PTR(err); ++ } else { ++ page = wait_on_page_read(page); ++ if (IS_ERR(page)) ++ return page; + } + out: + mark_page_accessed(page); +@@ -1866,40 +2091,25 @@ out: + } + + /** +- * read_cache_page_async - read into page cache, fill it if needed ++ * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: first arg to filler(data, page) function, often left as NULL + * +- * Same as read_cache_page, but don't wait for page to become unlocked +- * after submitting it to the filler. +- * + * Read into the page cache. If a page already exists, and PageUptodate() is +- * not set, try to fill the page but don't wait for it to become unlocked. ++ * not set, try to fill the page and wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +-struct page *read_cache_page_async(struct address_space *mapping, ++struct page *read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *, struct page *), + void *data) + { + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); + } +-EXPORT_SYMBOL(read_cache_page_async); +- +-static struct page *wait_on_page_read(struct page *page) +-{ +- if (!IS_ERR(page)) { +- wait_on_page_locked(page); +- if (!PageUptodate(page)) { +- page_cache_release(page); +- page = ERR_PTR(-EIO); +- } +- } +- return page; +-} ++EXPORT_SYMBOL(read_cache_page); + + /** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. +@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, + { + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + +- return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); ++ return do_read_cache_page(mapping, index, filler, NULL, gfp); + } + EXPORT_SYMBOL(read_cache_page_gfp); + +-/** +- * read_cache_page - read into page cache, fill it if needed +- * @mapping: the page's address_space +- * @index: the page index +- * @filler: function to perform the read +- * @data: first arg to filler(data, page) function, often left as NULL +- * +- * Read into the page cache. If a page already exists, and PageUptodate() is +- * not set, try to fill the page then wait for it to become unlocked. +- * +- * If the page does not get brought uptodate, return -EIO. 
+- */ +-struct page *read_cache_page(struct address_space *mapping, +- pgoff_t index, +- int (*filler)(void *, struct page *), +- void *data) +-{ +- return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); +-} +-EXPORT_SYMBOL(read_cache_page); +- + static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) + { +@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, + char *kaddr; + size_t copied; + +- BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; +@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, + { + const struct address_space_operations *aops = mapping->a_ops; + +- mark_page_accessed(page); + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); + } + EXPORT_SYMBOL(pagecache_write_end); +@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write); + struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) + { +- int status; +- gfp_t gfp_mask; + struct page *page; +- gfp_t gfp_notmask = 0; ++ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; + +- gfp_mask = mapping_gfp_mask(mapping); +- if (mapping_cap_account_dirty(mapping)) +- gfp_mask |= __GFP_WRITE; + if (flags & AOP_FLAG_NOFS) +- gfp_notmask = __GFP_FS; +-repeat: +- page = find_lock_page(mapping, index); ++ fgp_flags |= FGP_NOFS; ++ ++ page = pagecache_get_page(mapping, index, fgp_flags, ++ mapping_gfp_mask(mapping), ++ GFP_KERNEL); + if (page) +- goto found; ++ wait_for_stable_page(page); + +- page = __page_cache_alloc(gfp_mask & ~gfp_notmask); +- if (!page) +- return NULL; +- status = add_to_page_cache_lru(page, mapping, index, +- GFP_KERNEL & ~gfp_notmask); +- if (unlikely(status)) { +- page_cache_release(page); +- if (status == -EEXIST) +- goto repeat; +- return NULL; +- } +-found: +- wait_for_stable_page(page); + return page; + } + EXPORT_SYMBOL(grab_cache_page_write_begin); +@@ -2344,18 +2515,15 @@ again: + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); +- if (unlikely(status)) ++ if (unlikely(status < 0)) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + +- pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); +- pagefault_enable(); + flush_dcache_page(page); + +- mark_page_accessed(page); + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) +diff --git a/mm/fremap.c b/mm/fremap.c +index bbc4d660221a..34feba60a17e 100644 +--- a/mm/fremap.c ++++ b/mm/fremap.c +@@ -23,28 +23,44 @@ + + #include "internal.h" + ++static int mm_counter(struct page *page) ++{ ++ return PageAnon(page) ? 
MM_ANONPAGES : MM_FILEPAGES; ++} ++ + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { + pte_t pte = *ptep; ++ struct page *page; ++ swp_entry_t entry; + + if (pte_present(pte)) { +- struct page *page; +- + flush_cache_page(vma, addr, pte_pfn(pte)); + pte = ptep_clear_flush(vma, addr, ptep); + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); ++ update_hiwater_rss(mm); ++ dec_mm_counter(mm, mm_counter(page)); + page_remove_rmap(page); + page_cache_release(page); ++ } ++ } else { /* zap_pte() is not called when pte_none() */ ++ if (!pte_file(pte)) { + update_hiwater_rss(mm); +- dec_mm_counter(mm, MM_FILEPAGES); ++ entry = pte_to_swp_entry(pte); ++ if (non_swap_entry(entry)) { ++ if (is_migration_entry(entry)) { ++ page = migration_entry_to_page(entry); ++ dec_mm_counter(mm, mm_counter(page)); ++ } ++ } else { ++ free_swap_and_cache(entry); ++ dec_mm_counter(mm, MM_SWAPENTS); ++ } + } +- } else { +- if (!pte_file(pte)) +- free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear_not_present_full(mm, addr, ptep, 0); + } + } +diff --git a/mm/frontswap.c b/mm/frontswap.c +index 1b24bdcb3197..c30eec536f03 100644 +--- a/mm/frontswap.c ++++ b/mm/frontswap.c +@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); + + static unsigned long __frontswap_curr_pages(void) + { +- int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ plist_for_each_entry(si, &swap_active_head, list) + totalpages += atomic_read(&si->frontswap_pages); +- } + return totalpages; + } + +@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; +- int type; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ plist_for_each_entry(si, &swap_active_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; +@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + } + vm_unacct_memory(pages); + *unused = pages_to_unuse; +- *swapid = type; ++ *swapid = si->type; + ret = 0; + break; + } +@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change +- * so restart scan from swap_list.head each time ++ * so restart scan from swap_active_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 389973fd6bb7..2ee53749eb48 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag, + HPAGE_PMD_ORDER, vma, haddr, nd); + } + +-#ifndef CONFIG_NUMA +-static inline struct page *alloc_hugepage(int defrag) +-{ +- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), +- HPAGE_PMD_ORDER); +-} +-#endif +- + static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + struct page *zero_page) +@@ -2197,7 +2189,58 @@ static void 
khugepaged_alloc_sleep(void) + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + } + ++static int khugepaged_node_load[MAX_NUMNODES]; ++ ++static bool khugepaged_scan_abort(int nid) ++{ ++ int i; ++ ++ /* ++ * If zone_reclaim_mode is disabled, then no extra effort is made to ++ * allocate memory locally. ++ */ ++ if (!zone_reclaim_mode) ++ return false; ++ ++ /* If there is a count for this node already, it must be acceptable */ ++ if (khugepaged_node_load[nid]) ++ return false; ++ ++ for (i = 0; i < MAX_NUMNODES; i++) { ++ if (!khugepaged_node_load[i]) ++ continue; ++ if (node_distance(nid, i) > RECLAIM_DISTANCE) ++ return true; ++ } ++ return false; ++} ++ + #ifdef CONFIG_NUMA ++static int khugepaged_find_target_node(void) ++{ ++ static int last_khugepaged_target_node = NUMA_NO_NODE; ++ int nid, target_node = 0, max_value = 0; ++ ++ /* find first node with max normal pages hit */ ++ for (nid = 0; nid < MAX_NUMNODES; nid++) ++ if (khugepaged_node_load[nid] > max_value) { ++ max_value = khugepaged_node_load[nid]; ++ target_node = nid; ++ } ++ ++ /* do some balance if several nodes have the same hit record */ ++ if (target_node <= last_khugepaged_target_node) ++ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; ++ nid++) ++ if (max_value == khugepaged_node_load[nid]) { ++ target_node = nid; ++ break; ++ } ++ ++ last_khugepaged_target_node = target_node; ++ return target_node; ++} ++ + static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) + { + if (IS_ERR(*hpage)) { +@@ -2231,9 +2274,8 @@ static struct page + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ +- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, +- node, __GFP_OTHER_NODE); +- ++ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( ++ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + /* + * After allocating the hugepage, release the mmap_sem read lock in + * preparation for taking it in write mode. +@@ -2249,6 +2291,17 @@ static struct page + return *hpage; + } + #else ++static int khugepaged_find_target_node(void) ++{ ++ return 0; ++} ++ ++static inline struct page *alloc_hugepage(int defrag) ++{ ++ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), ++ HPAGE_PMD_ORDER); ++} ++ + static struct page *khugepaged_alloc_hugepage(bool *wait) + { + struct page *hpage; +@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, + if (pmd_trans_huge(*pmd)) + goto out; + ++ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { +@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, + if (unlikely(!page)) + goto out_unmap; + /* +- * Chose the node of the first page. This could +- * be more sophisticated and look at more pages, +- * but isn't for now. ++ * Record which node the original page is from and save this ++ * information to khugepaged_node_load[]. ++ * Khupaged will allocate hugepage from the node has the max ++ * hit record. 
+ */ +- if (node == NUMA_NO_NODE) +- node = page_to_nid(page); ++ node = page_to_nid(page); ++ if (khugepaged_scan_abort(node)) ++ goto out_unmap; ++ khugepaged_node_load[node]++; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; +@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, + ret = 1; + out_unmap: + pte_unmap_unlock(pte, ptl); +- if (ret) ++ if (ret) { ++ node = khugepaged_find_target_node(); + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma, node); ++ } + out: + return ret; + } +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index f80b17106d24..c33d8a65298c 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, + goto err; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + +@@ -596,7 +596,7 @@ retry_cpuset: + } + + mpol_cond_put(mpol); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + +@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + unsigned long tmp; + int ret; + ++ if (!hugepages_supported()) ++ return -ENOTSUPP; ++ + tmp = h->max_huge_pages; + + if (write && h->order >= MAX_ORDER) +@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, + unsigned long tmp; + int ret; + ++ if (!hugepages_supported()) ++ return -ENOTSUPP; ++ + tmp = h->nr_overcommit_huge_pages; + + if (write && h->order >= MAX_ORDER) +@@ -2192,6 +2198,8 @@ out: + void hugetlb_report_meminfo(struct seq_file *m) + { + struct hstate *h = &default_hstate; ++ if (!hugepages_supported()) ++ return; + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" +@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m) + int hugetlb_report_node_meminfo(int nid, char *buf) + { + struct hstate *h = &default_hstate; ++ if (!hugepages_supported()) ++ return 0; + return sprintf(buf, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n" +@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void) + struct hstate *h; + int nid; + ++ if (!hugepages_supported()) ++ return; ++ + for_each_node_state(nid, N_MEMORY) + for_each_hstate(h) + pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", +diff --git a/mm/internal.h b/mm/internal.h +index fdddbc83ac5f..d610f7ce4e9c 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -11,6 +11,7 @@ + #ifndef __MM_INTERNAL_H + #define __MM_INTERNAL_H + ++#include <linux/fs.h> + #include <linux/mm.h> + + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, +@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) + atomic_set(&page->_count, v); + } + ++extern int __do_page_cache_readahead(struct address_space *mapping, ++ struct file *filp, pgoff_t offset, unsigned long nr_to_read, ++ unsigned long lookahead_size); ++ ++/* ++ * Submit IO for the read-ahead request in file_ra_state. 
++ */ ++static inline unsigned long ra_submit(struct file_ra_state *ra, ++ struct address_space *mapping, struct file *filp) ++{ ++ return __do_page_cache_readahead(mapping, filp, ++ ra->start, ra->size, ra->async_size); ++} ++ + /* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. +@@ -120,7 +135,7 @@ struct compact_control { + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ +- bool sync; /* Synchronous migration */ ++ enum migrate_mode mode; /* Async or sync migration mode */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated +@@ -130,7 +145,10 @@ struct compact_control { + int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +- bool contended; /* True if a lock was contended */ ++ bool contended; /* True if a lock was contended, or ++ * need_resched() true during async ++ * compaction ++ */ + }; + + unsigned long +diff --git a/mm/madvise.c b/mm/madvise.c +index 539eeb96b323..a402f8fdc68e 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, + for (; start < end; start += PAGE_SIZE) { + index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + +- page = find_get_page(mapping, index); ++ page = find_get_entry(mapping, index); + if (!radix_tree_exceptional_entry(page)) { + if (page) + page_cache_release(page); +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 6e3f9c39bc22..4ab233d4714a 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags) + + /* Keep page count to indicate a given hugepage is isolated. 
*/ + list_move(&hpage->lru, &pagelist); +- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, ++ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", +@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags) + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + list_add(&page->lru, &pagelist); +- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, ++ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + putback_lru_pages(&pagelist); +diff --git a/mm/memory.c b/mm/memory.c +index 99fe3aa1035c..b5901068495f 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -878,7 +878,7 @@ out_set_pte: + return 0; + } + +-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, ++static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { +@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm, + pte_t entry; + spinlock_t *ptl; + +- entry = *pte; ++ entry = ACCESS_ONCE(*pte); + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (vma->vm_ops) { +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c +index ed85fe3870e2..d31730564617 100644 +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ +- ret = migrate_pages(&source, alloc_migrate_target, 0, ++ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + if (ret) + putback_movable_pages(&source); +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +index 0437f3595b32..cc61c7a7d6a1 100644 +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { +- err = migrate_pages(&pagelist, new_node_page, dest, ++ err = migrate_pages(&pagelist, new_node_page, NULL, dest, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(&pagelist); +@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len, + + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); +- nr_failed = migrate_pages(&pagelist, new_page, ++ nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + if (nr_failed) + putback_movable_pages(&pagelist); +@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp) + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. 
+ * +- * Must be protected by get_mems_allowed() ++ * Must be protected by read_mems_allowed_begin() + */ + struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags, struct mempolicy **mpol, +@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + + retry_cpuset: + pol = get_vma_policy(current, vma, addr); +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + unsigned nid; +@@ -2045,7 +2045,7 @@ retry_cpuset: + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return page; +@@ -2055,7 +2055,7 @@ retry_cpuset: + policy_nodemask(gfp, pol)); + if (unlikely(mpol_needs_cond_ref(pol))) + __mpol_put(pol); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + } +@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) + pol = &default_policy; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * No reference counting needed for current->mempolicy +@@ -2102,7 +2102,7 @@ retry_cpuset: + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); + +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return page; +diff --git a/mm/migrate.c b/mm/migrate.c +index e3cf71dd1288..96d4d814ae2f 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -867,8 +867,9 @@ out: + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +-static int unmap_and_move(new_page_t get_new_page, unsigned long private, +- struct page *page, int force, enum migrate_mode mode) ++static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, ++ unsigned long private, struct page *page, int force, ++ enum migrate_mode mode) + { + int rc = 0; + int *result = NULL; +@@ -912,11 +913,18 @@ out: + page_is_file_cache(page)); + putback_lru_page(page); + } ++ + /* +- * Move the new page to the LRU. If migration was not successful +- * then this will free the page. ++ * If migration was not successful and there's a freeing callback, use ++ * it. Otherwise, putback_lru_page() will drop the reference grabbed ++ * during isolation. + */ +- putback_lru_page(newpage); ++ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { ++ ClearPageSwapBacked(newpage); ++ put_new_page(newpage, private); ++ } else ++ putback_lru_page(newpage); ++ + if (result) { + if (rc) + *result = rc; +@@ -945,8 +953,9 @@ out: + * will wait in the page fault for migration to complete. 
+ */ + static int unmap_and_move_huge_page(new_page_t get_new_page, +- unsigned long private, struct page *hpage, +- int force, enum migrate_mode mode) ++ free_page_t put_new_page, unsigned long private, ++ struct page *hpage, int force, ++ enum migrate_mode mode) + { + int rc = 0; + int *result = NULL; +@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1, mode); + +- if (rc) ++ if (rc != MIGRATEPAGE_SUCCESS) + remove_migration_ptes(hpage, hpage); + + if (anon_vma) + put_anon_vma(anon_vma); + +- if (!rc) ++ if (rc == MIGRATEPAGE_SUCCESS) + hugetlb_cgroup_migrate(hpage, new_hpage); + + unlock_page(hpage); + out: + if (rc != -EAGAIN) + putback_active_hugepage(hpage); +- put_page(new_hpage); ++ ++ /* ++ * If migration was not successful and there's a freeing callback, use ++ * it. Otherwise, put_page() will drop the reference grabbed during ++ * isolation. ++ */ ++ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) ++ put_new_page(new_hpage, private); ++ else ++ put_page(new_hpage); ++ + if (result) { + if (rc) + *result = rc; +@@ -1012,6 +1031,8 @@ out: + * @from: The list of pages to be migrated. + * @get_new_page: The function used to allocate free pages to be used + * as the target of the page migration. ++ * @put_new_page: The function used to free target pages if migration ++ * fails, or NULL if no special handling is necessary. + * @private: Private data to be passed on to get_new_page() + * @mode: The migration mode that specifies the constraints for + * page migration, if any. +@@ -1025,7 +1046,8 @@ out: + * Returns the number of pages that were not migrated, or an error code. + */ + int migrate_pages(struct list_head *from, new_page_t get_new_page, +- unsigned long private, enum migrate_mode mode, int reason) ++ free_page_t put_new_page, unsigned long private, ++ enum migrate_mode mode, int reason) + { + int retry = 1; + int nr_failed = 0; +@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + + if (PageHuge(page)) + rc = unmap_and_move_huge_page(get_new_page, +- private, page, pass > 2, mode); ++ put_new_page, private, page, ++ pass > 2, mode); + else +- rc = unmap_and_move(get_new_page, private, +- page, pass > 2, mode); ++ rc = unmap_and_move(get_new_page, put_new_page, ++ private, page, pass > 2, mode); + + switch(rc) { + case -ENOMEM: +@@ -1194,7 +1217,7 @@ set_status: + + err = 0; + if (!list_empty(&pagelist)) { +- err = migrate_pages(&pagelist, new_page_node, ++ err = migrate_pages(&pagelist, new_page_node, NULL, + (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(&pagelist); +@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node) + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, +- node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); ++ NULL, node, MIGRATE_ASYNC, ++ MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; +diff --git a/mm/mincore.c b/mm/mincore.c +index da2be56a7b8f..06cb81005c77 100644 +--- a/mm/mincore.c ++++ b/mm/mincore.c +@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) + * any other file mapping (ie. marked !present and faulted in with + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 
+ */ +- page = find_get_page(mapping, pgoff); + #ifdef CONFIG_SWAP +- /* shmem/tmpfs may return swap: account for swapcache page too. */ +- if (radix_tree_exceptional_entry(page)) { +- swp_entry_t swap = radix_to_swp_entry(page); +- page = find_get_page(swap_address_space(swap), swap.val); +- } ++ if (shmem_mapping(mapping)) { ++ page = find_get_entry(mapping, pgoff); ++ /* ++ * shmem/tmpfs may return swap: account for swapcache ++ * page too. ++ */ ++ if (radix_tree_exceptional_entry(page)) { ++ swp_entry_t swp = radix_to_swp_entry(page); ++ page = find_get_page(swap_address_space(swp), swp.val); ++ } ++ } else ++ page = find_get_page(mapping, pgoff); ++#else ++ page = find_get_page(mapping, pgoff); + #endif + if (page) { + present = PageUptodate(page); +diff --git a/mm/mmap.c b/mm/mmap.c +index af99b9ed2007..c1249cb7dc15 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -10,6 +10,7 @@ + #include <linux/slab.h> + #include <linux/backing-dev.h> + #include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/shm.h> + #include <linux/mman.h> + #include <linux/pagemap.h> +@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + prev->vm_next = next = vma->vm_next; + if (next) + next->vm_prev = prev; +- if (mm->mmap_cache == vma) +- mm->mmap_cache = prev; ++ ++ /* Kill the cache */ ++ vmacache_invalidate(mm); + } + + /* +@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area); + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) + { +- struct vm_area_struct *vma = NULL; ++ struct rb_node *rb_node; ++ struct vm_area_struct *vma; + + /* Check the cache first. */ +- /* (Cache hit rate is typically around 35%.) */ +- vma = ACCESS_ONCE(mm->mmap_cache); +- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { +- struct rb_node *rb_node; ++ vma = vmacache_find(mm, addr); ++ if (likely(vma)) ++ return vma; + +- rb_node = mm->mm_rb.rb_node; +- vma = NULL; ++ rb_node = mm->mm_rb.rb_node; ++ vma = NULL; + +- while (rb_node) { +- struct vm_area_struct *vma_tmp; +- +- vma_tmp = rb_entry(rb_node, +- struct vm_area_struct, vm_rb); +- +- if (vma_tmp->vm_end > addr) { +- vma = vma_tmp; +- if (vma_tmp->vm_start <= addr) +- break; +- rb_node = rb_node->rb_left; +- } else +- rb_node = rb_node->rb_right; +- } +- if (vma) +- mm->mmap_cache = vma; ++ while (rb_node) { ++ struct vm_area_struct *tmp; ++ ++ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); ++ ++ if (tmp->vm_end > addr) { ++ vma = tmp; ++ if (tmp->vm_start <= addr) ++ break; ++ rb_node = rb_node->rb_left; ++ } else ++ rb_node = rb_node->rb_right; + } ++ ++ if (vma) ++ vmacache_update(addr, vma); + return vma; + } + +@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + } else + mm->highest_vm_end = prev ? prev->vm_end : 0; + tail_vma->vm_next = NULL; +- mm->mmap_cache = NULL; /* Kill the cache. 
*/ ++ ++ /* Kill the cache */ ++ vmacache_invalidate(mm); + } + + /* +diff --git a/mm/nommu.c b/mm/nommu.c +index ecd1f158548e..1221d2b66e97 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -15,6 +15,7 @@ + + #include <linux/export.h> + #include <linux/mm.h> ++#include <linux/vmacache.h> + #include <linux/mman.h> + #include <linux/swap.h> + #include <linux/file.h> +@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) + */ + static void delete_vma_from_mm(struct vm_area_struct *vma) + { ++ int i; + struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; ++ struct task_struct *curr = current; + + kenter("%p", vma); + + protect_vma(vma, 0); + + mm->map_count--; +- if (mm->mmap_cache == vma) +- mm->mmap_cache = NULL; ++ for (i = 0; i < VMACACHE_SIZE; i++) { ++ /* if the vma is cached, invalidate the entire cache */ ++ if (curr->vmacache[i] == vma) { ++ vmacache_invalidate(curr->mm); ++ break; ++ } ++ } + + /* remove the VMA from the mapping */ + if (vma->vm_file) { +@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) + struct vm_area_struct *vma; + + /* check the cache first */ +- vma = ACCESS_ONCE(mm->mmap_cache); +- if (vma && vma->vm_start <= addr && vma->vm_end > addr) ++ vma = vmacache_find(mm, addr); ++ if (likely(vma)) + return vma; + + /* trawl the list (there may be multiple mappings in which addr +@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { +- mm->mmap_cache = vma; ++ vmacache_update(addr, vma); + return vma; + } + } +@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long end = addr + len; + + /* check the cache first */ +- vma = mm->mmap_cache; +- if (vma && vma->vm_start == addr && vma->vm_end == end) ++ vma = vmacache_find_exact(mm, addr, end); ++ if (vma) + return vma; + + /* trawl the list (there may be multiple mappings in which addr +@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { +- mm->mmap_cache = vma; ++ vmacache_update(addr, vma); + return vma; + } + } +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a280f772bc66..2f91223dbe93 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -405,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) + return bad; + } + +-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) ++static inline void prep_zero_page(struct page *page, unsigned int order, ++ gfp_t gfp_flags) + { + int i; + +@@ -449,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { } + static inline void clear_page_guard_flag(struct page *page) { } + #endif + +-static inline void set_page_order(struct page *page, int order) ++static inline void set_page_order(struct page *page, unsigned int order) + { + set_page_private(page, order); + __SetPageBuddy(page); +@@ -500,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) + * For recording page's order, we use page_private(page). 
+ */ + static inline int page_is_buddy(struct page *page, struct page *buddy, +- int order) ++ unsigned int order) + { + if (!pfn_valid_within(page_to_pfn(buddy))) + return 0; + +- if (page_zone_id(page) != page_zone_id(buddy)) +- return 0; +- + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); ++ ++ if (page_zone_id(page) != page_zone_id(buddy)) ++ return 0; ++ + return 1; + } + + if (PageBuddy(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); ++ ++ /* ++ * zone check is done late to avoid uselessly ++ * calculating zone/node ids for pages that could ++ * never merge. ++ */ ++ if (page_zone_id(page) != page_zone_id(buddy)) ++ return 0; ++ + return 1; + } + return 0; +@@ -546,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, + */ + + static inline void __free_one_page(struct page *page, ++ unsigned long pfn, + struct zone *zone, unsigned int order, + int migratetype) + { +@@ -562,7 +574,7 @@ static inline void __free_one_page(struct page *page, + + VM_BUG_ON(migratetype == -1); + +- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); ++ page_idx = pfn & ((1 << MAX_ORDER) - 1); + + VM_BUG_ON(page_idx & ((1 << order) - 1)); + VM_BUG_ON(bad_range(zone, page)); +@@ -652,9 +664,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, + int migratetype = 0; + int batch_free = 0; + int to_free = count; ++ unsigned long nr_scanned; + + spin_lock(&zone->lock); +- zone->pages_scanned = 0; ++ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); ++ if (nr_scanned) ++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + + while (to_free) { + struct page *page; +@@ -686,7 +701,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, + list_del(&page->lru); + mt = get_freepage_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ +- __free_one_page(page, zone, 0, mt); ++ __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + if (likely(!is_migrate_isolate_page(page))) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); +@@ -698,13 +713,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, + spin_unlock(&zone->lock); + } + +-static void free_one_page(struct zone *zone, struct page *page, int order, ++static void free_one_page(struct zone *zone, ++ struct page *page, unsigned long pfn, ++ unsigned int order, + int migratetype) + { ++ unsigned long nr_scanned; + spin_lock(&zone->lock); +- zone->pages_scanned = 0; ++ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); ++ if (nr_scanned) ++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + +- __free_one_page(page, zone, order, migratetype); ++ __free_one_page(page, pfn, zone, order, migratetype); + if (unlikely(!is_migrate_isolate(migratetype))) + __mod_zone_freepage_state(zone, 1 << order, migratetype); + spin_unlock(&zone->lock); +@@ -741,15 +761,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) + { + unsigned long flags; + int migratetype; ++ unsigned long pfn = page_to_pfn(page); + + if (!free_pages_prepare(page, order)) + return; + ++ migratetype = get_pfnblock_migratetype(page, pfn); + local_irq_save(flags); + __count_vm_events(PGFREE, 1 << order); +- migratetype = get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); +- free_one_page(page_zone(page), page, order, migratetype); ++ free_one_page(page_zone(page), page, pfn, order, migratetype); + local_irq_restore(flags); + } + +@@ 
-869,7 +890,7 @@ static inline int check_new_page(struct page *page) + return 0; + } + +-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) ++static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) + { + int i; + +@@ -918,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + rmv_page_order(page); + area->nr_free--; + expand(zone, page, order, current_order, area, migratetype); ++ set_freepage_migratetype(page, migratetype); + return page; + } + +@@ -1042,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, + { + int current_order = page_order(page); + ++ /* ++ * When borrowing from MIGRATE_CMA, we need to release the excess ++ * buddy pages to CMA itself. We also ensure the freepage_migratetype ++ * is set to CMA so it is returned to the correct freelist in case ++ * the page ends up being not actually allocated from the pcp lists. ++ */ + if (is_migrate_cma(fallback_type)) + return fallback_type; + +@@ -1073,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, + + /* Remove an element from the buddy allocator from the fallback list */ + static inline struct page * +-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) ++__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) + { + struct free_area *area; +- int current_order; ++ unsigned int current_order; + struct page *page; + int migratetype, new_type, i; + + /* Find the largest possible block of pages in the other list */ +- for (current_order = MAX_ORDER-1; current_order >= order; +- --current_order) { ++ for (current_order = MAX_ORDER-1; ++ current_order >= order && current_order <= MAX_ORDER-1; ++ --current_order) { + for (i = 0;; i++) { + migratetype = fallbacks[start_migratetype][i]; + +@@ -1106,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) + list_del(&page->lru); + rmv_page_order(page); + +- /* +- * Borrow the excess buddy pages as well, irrespective +- * of whether we stole freepages, or took ownership of +- * the pageblock or not. +- * +- * Exception: When borrowing from MIGRATE_CMA, release +- * the excess buddy pages to CMA itself. +- */ + expand(zone, page, order, current_order, area, +- is_migrate_cma(migratetype) +- ? migratetype : start_migratetype); ++ new_type); ++ /* The freepage_migratetype may differ from pageblock's ++ * migratetype depending on the decisions in ++ * try_to_steal_freepages. This is OK as long as it does ++ * not differ for MIGRATE_CMA type. ++ */ ++ set_freepage_migratetype(page, new_type); + +- trace_mm_page_alloc_extfrag(page, order, +- current_order, start_migratetype, migratetype, +- new_type == start_migratetype); ++ trace_mm_page_alloc_extfrag(page, order, current_order, ++ start_migratetype, migratetype, new_type); + + return page; + } +@@ -1166,9 +1191,9 @@ retry_reserve: + */ + static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list, +- int migratetype, int cold) ++ int migratetype, bool cold) + { +- int mt = migratetype, i; ++ int i; + + spin_lock(&zone->lock); + for (i = 0; i < count; ++i) { +@@ -1185,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * merge IO requests if the physical pages are ordered + * properly. 
+ */ +- if (likely(cold == 0)) ++ if (likely(!cold)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); +- if (IS_ENABLED(CONFIG_CMA)) { +- mt = get_pageblock_migratetype(page); +- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) +- mt = migratetype; +- } +- set_freepage_migratetype(page, mt); + list = &page->lru; +- if (is_migrate_cma(mt)) ++ if (is_migrate_cma(get_freepage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); + } +@@ -1320,7 +1339,7 @@ void mark_free_pages(struct zone *zone) + { + unsigned long pfn, max_zone_pfn; + unsigned long flags; +- int order, t; ++ unsigned int order, t; + struct list_head *curr; + + if (zone_is_empty(zone)) +@@ -1352,19 +1371,20 @@ void mark_free_pages(struct zone *zone) + + /* + * Free a 0-order page +- * cold == 1 ? free a cold page : free a hot page ++ * cold == true ? free a cold page : free a hot page + */ +-void free_hot_cold_page(struct page *page, int cold) ++void free_hot_cold_page(struct page *page, bool cold) + { + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; ++ unsigned long pfn = page_to_pfn(page); + int migratetype; + + if (!free_pages_prepare(page, 0)) + return; + +- migratetype = get_pageblock_migratetype(page); ++ migratetype = get_pfnblock_migratetype(page, pfn); + set_freepage_migratetype(page, migratetype); + local_irq_save(flags); + __count_vm_event(PGFREE); +@@ -1378,17 +1398,17 @@ void free_hot_cold_page(struct page *page, int cold) + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(is_migrate_isolate(migratetype))) { +- free_one_page(zone, page, 0, migratetype); ++ free_one_page(zone, page, pfn, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + + pcp = &this_cpu_ptr(zone->pageset)->pcp; +- if (cold) +- list_add_tail(&page->lru, &pcp->lists[migratetype]); +- else ++ if (!cold) + list_add(&page->lru, &pcp->lists[migratetype]); ++ else ++ list_add_tail(&page->lru, &pcp->lists[migratetype]); + pcp->count++; + if (pcp->count >= pcp->high) { + unsigned long batch = ACCESS_ONCE(pcp->batch); +@@ -1403,7 +1423,7 @@ out: + /* + * Free a list of 0-order pages + */ +-void free_hot_cold_page_list(struct list_head *list, int cold) ++void free_hot_cold_page_list(struct list_head *list, bool cold) + { + struct page *page, *next; + +@@ -1515,12 +1535,12 @@ int split_free_page(struct page *page) + */ + static inline + struct page *buffered_rmqueue(struct zone *preferred_zone, +- struct zone *zone, int order, gfp_t gfp_flags, +- int migratetype) ++ struct zone *zone, unsigned int order, ++ gfp_t gfp_flags, int migratetype) + { + unsigned long flags; + struct page *page; +- int cold = !!(gfp_flags & __GFP_COLD); ++ bool cold = ((gfp_flags & __GFP_COLD) != 0); + + again: + if (likely(order == 0)) { +@@ -1565,10 +1585,13 @@ again: + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), +- get_pageblock_migratetype(page)); ++ get_freepage_migratetype(page)); + } + + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); ++ if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && ++ !zone_is_fair_depleted(zone)) ++ zone_set_flag(zone, ZONE_FAIR_DEPLETED); + + __count_zone_vm_events(PGALLOC, zone, 1 << order); + zone_statistics(preferred_zone, zone, gfp_flags); +@@ -1665,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) + * Return true if free pages are above 'mark'. This takes into account the order + * of the allocation. 
+ */ +-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, +- int classzone_idx, int alloc_flags, long free_pages) ++static bool __zone_watermark_ok(struct zone *z, unsigned int order, ++ unsigned long mark, int classzone_idx, int alloc_flags, ++ long free_pages) + { + /* free_pages my go negative - that's OK */ + long min = mark; +- long lowmem_reserve = z->lowmem_reserve[classzone_idx]; + int o; + long free_cma = 0; + +@@ -1685,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + #endif + +- if (free_pages - free_cma <= min + lowmem_reserve) ++ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + return false; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ +@@ -1700,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + return true; + } + +-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, ++bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, int alloc_flags) + { + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); + } + +-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, +- int classzone_idx, int alloc_flags) ++bool zone_watermark_ok_safe(struct zone *z, unsigned int order, ++ unsigned long mark, int classzone_idx, int alloc_flags) + { + long free_pages = zone_page_state(z, NR_FREE_PAGES); + +@@ -1850,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) + { + int i; + +- for_each_online_node(i) ++ for_each_node_state(i, N_MEMORY) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) + node_set(i, NODE_DATA(nid)->reclaim_nodes); + else +@@ -1893,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid) + } + #endif /* CONFIG_NUMA */ + ++static void reset_alloc_batches(struct zone *preferred_zone) ++{ ++ struct zone *zone = preferred_zone->zone_pgdat->node_zones; ++ ++ do { ++ mod_zone_page_state(zone, NR_ALLOC_BATCH, ++ high_wmark_pages(zone) - low_wmark_pages(zone) - ++ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); ++ zone_clear_flag(zone, ZONE_FAIR_DEPLETED); ++ } while (zone++ != preferred_zone); ++} ++ + /* + * get_page_from_freelist goes through the zonelist trying to allocate + * a page. +@@ -1900,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid) + static struct page * + get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, + struct zonelist *zonelist, int high_zoneidx, int alloc_flags, +- struct zone *preferred_zone, int migratetype) ++ struct zone *preferred_zone, int classzone_idx, int migratetype) + { + struct zoneref *z; + struct page *page = NULL; +- int classzone_idx; + struct zone *zone; + nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ + int zlc_active = 0; /* set if using zonelist_cache */ + int did_zlc_setup = 0; /* just call zlc_setup() one time */ ++ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && ++ (gfp_mask & __GFP_WRITE); ++ int nr_fair_skipped = 0; ++ bool zonelist_rescan; + +- classzone_idx = zone_idx(preferred_zone); + zonelist_scan: ++ zonelist_rescan = false; ++ + /* + * Scan zonelist, looking for a zone with enough free. + * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
+@@ -1923,12 +1962,10 @@ zonelist_scan: + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; +- if ((alloc_flags & ALLOC_CPUSET) && ++ if (cpusets_enabled() && ++ (alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed_softwall(zone, gfp_mask)) + continue; +- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); +- if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) +- goto try_this_zone; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. The zone a +@@ -1937,9 +1974,11 @@ zonelist_scan: + */ + if (alloc_flags & ALLOC_FAIR) { + if (!zone_local(preferred_zone, zone)) ++ break; ++ if (zone_is_fair_depleted(zone)) { ++ nr_fair_skipped++; + continue; +- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) +- continue; ++ } + } + /* + * When allocating a page cache page for writing, we +@@ -1967,15 +2006,19 @@ zonelist_scan: + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ +- if ((alloc_flags & ALLOC_WMARK_LOW) && +- (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) +- goto this_zone_full; ++ if (consider_zone_dirty && !zone_dirty_ok(zone)) ++ continue; + + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { + int ret; + ++ /* Checked here to keep the fast path fast */ ++ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); ++ if (alloc_flags & ALLOC_NO_WATERMARKS) ++ goto try_this_zone; ++ + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { + /* +@@ -2037,17 +2080,11 @@ try_this_zone: + if (page) + break; + this_zone_full: +- if (IS_ENABLED(CONFIG_NUMA)) ++ if (IS_ENABLED(CONFIG_NUMA) && zlc_active) + zlc_mark_zone_full(zonelist, z); + } + +- if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { +- /* Disable zlc cache for second zonelist scan */ +- zlc_active = 0; +- goto zonelist_scan; +- } +- +- if (page) ++ if (page) { + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is +@@ -2056,8 +2093,37 @@ this_zone_full: + * for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); ++ return page; ++ } + +- return page; ++ /* ++ * The first pass makes sure allocations are spread fairly within the ++ * local node. However, the local node might have free pages left ++ * after the fairness batches are exhausted, and remote zones haven't ++ * even been considered yet. Try once more without fairness, and ++ * include remote zones now, before entering the slowpath and waking ++ * kswapd: prefer spilling to a remote zone over swapping locally. 
++ */ ++ if (alloc_flags & ALLOC_FAIR) { ++ alloc_flags &= ~ALLOC_FAIR; ++ if (nr_fair_skipped) { ++ zonelist_rescan = true; ++ reset_alloc_batches(preferred_zone); ++ } ++ if (nr_online_nodes > 1) ++ zonelist_rescan = true; ++ } ++ ++ if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { ++ /* Disable zlc cache for second zonelist scan */ ++ zlc_active = 0; ++ zonelist_rescan = true; ++ } ++ ++ if (zonelist_rescan) ++ goto zonelist_scan; ++ ++ return NULL; + } + + /* +@@ -2173,7 +2239,7 @@ static inline struct page * + __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, +- int migratetype) ++ int classzone_idx, int migratetype) + { + struct page *page; + +@@ -2191,7 +2257,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + if (page) + goto out; + +@@ -2226,7 +2292,7 @@ static struct page * + __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +- int migratetype, bool sync_migration, ++ int classzone_idx, int migratetype, enum migrate_mode mode, + bool *contended_compaction, bool *deferred_compaction, + unsigned long *did_some_progress) + { +@@ -2240,7 +2306,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + + current->flags |= PF_MEMALLOC; + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, +- nodemask, sync_migration, ++ nodemask, mode, + contended_compaction); + current->flags &= ~PF_MEMALLOC; + +@@ -2254,13 +2320,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + if (page) { + preferred_zone->compact_blockskip_flush = false; +- preferred_zone->compact_considered = 0; +- preferred_zone->compact_defer_shift = 0; +- if (order >= preferred_zone->compact_order_failed) +- preferred_zone->compact_order_failed = order + 1; ++ compaction_defer_reset(preferred_zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; + } +@@ -2276,7 +2339,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. 
+ */ +- if (sync_migration) ++ if (mode != MIGRATE_ASYNC) + defer_compaction(preferred_zone, order); + + cond_resched(); +@@ -2289,9 +2352,9 @@ static inline struct page * + __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +- int migratetype, bool sync_migration, +- bool *contended_compaction, bool *deferred_compaction, +- unsigned long *did_some_progress) ++ int classzone_idx, int migratetype, ++ enum migrate_mode mode, bool *contended_compaction, ++ bool *deferred_compaction, unsigned long *did_some_progress) + { + return NULL; + } +@@ -2330,7 +2393,7 @@ static inline struct page * + __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +- int migratetype, unsigned long *did_some_progress) ++ int classzone_idx, int migratetype, unsigned long *did_some_progress) + { + struct page *page = NULL; + bool drained = false; +@@ -2348,7 +2411,8 @@ retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, ++ migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because +@@ -2371,14 +2435,14 @@ static inline struct page * + __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, +- int migratetype) ++ int classzone_idx, int migratetype) + { + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + + if (!page && gfp_mask & __GFP_NOFAIL) + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); +@@ -2387,28 +2451,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + return page; + } + +-static void reset_alloc_batches(struct zonelist *zonelist, +- enum zone_type high_zoneidx, +- struct zone *preferred_zone) +-{ +- struct zoneref *z; +- struct zone *zone; +- +- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { +- /* +- * Only reset the batches of zones that were actually +- * considered in the fairness pass, we don't want to +- * trash fairness information for zones that are not +- * actually part of this zonelist's round-robin cycle. 
+- */ +- if (!zone_local(preferred_zone, zone)) +- continue; +- mod_zone_page_state(zone, NR_ALLOC_BATCH, +- high_wmark_pages(zone) - low_wmark_pages(zone) - +- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); +- } +-} +- + static void wake_all_kswapds(unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, +@@ -2479,14 +2521,14 @@ static inline struct page * + __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, +- int migratetype) ++ int classzone_idx, int migratetype) + { + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; +- bool sync_migration = false; ++ enum migrate_mode migration_mode = MIGRATE_ASYNC; + bool deferred_compaction = false; + bool contended_compaction = false; + +@@ -2528,15 +2570,19 @@ restart: + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ +- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) +- first_zones_zonelist(zonelist, high_zoneidx, NULL, +- &preferred_zone); ++ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { ++ struct zoneref *preferred_zoneref; ++ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, ++ NULL, ++ &preferred_zone); ++ classzone_idx = zonelist_zone_idx(preferred_zoneref); ++ } + + rebalance: + /* This is the last chance, in general, before the goto nopage. */ + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + if (page) + goto got_pg; + +@@ -2551,7 +2597,7 @@ rebalance: + + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + if (page) { + goto got_pg; + } +@@ -2573,17 +2619,16 @@ rebalance: + * Try direct compaction. The first pass is asynchronous. 
Subsequent + * attempts after direct reclaim are synchronous + */ +- page = __alloc_pages_direct_compact(gfp_mask, order, +- zonelist, high_zoneidx, +- nodemask, +- alloc_flags, preferred_zone, +- migratetype, sync_migration, +- &contended_compaction, ++ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, ++ high_zoneidx, nodemask, alloc_flags, ++ preferred_zone, ++ classzone_idx, migratetype, ++ migration_mode, &contended_compaction, + &deferred_compaction, + &did_some_progress); + if (page) + goto got_pg; +- sync_migration = true; ++ migration_mode = MIGRATE_SYNC_LIGHT; + + /* + * If compaction is deferred for high-order allocations, it is because +@@ -2600,7 +2645,8 @@ rebalance: + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, +- migratetype, &did_some_progress); ++ classzone_idx, migratetype, ++ &did_some_progress); + if (page) + goto got_pg; + +@@ -2619,7 +2665,7 @@ rebalance: + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, preferred_zone, +- migratetype); ++ classzone_idx, migratetype); + if (page) + goto got_pg; + +@@ -2658,12 +2704,11 @@ rebalance: + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ +- page = __alloc_pages_direct_compact(gfp_mask, order, +- zonelist, high_zoneidx, +- nodemask, +- alloc_flags, preferred_zone, +- migratetype, sync_migration, +- &contended_compaction, ++ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, ++ high_zoneidx, nodemask, alloc_flags, ++ preferred_zone, ++ classzone_idx, migratetype, ++ migration_mode, &contended_compaction, + &deferred_compaction, + &did_some_progress); + if (page) +@@ -2689,11 +2734,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + { + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; ++ struct zoneref *preferred_zoneref; + struct page *page = NULL; + int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + struct mem_cgroup *memcg = NULL; ++ int classzone_idx; + + gfp_mask &= gfp_allowed_mask; + +@@ -2720,42 +2767,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + return NULL; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + /* The preferred zone is used for statistics later */ +- first_zones_zonelist(zonelist, high_zoneidx, ++ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); + if (!preferred_zone) + goto out; ++ classzone_idx = zonelist_zone_idx(preferred_zoneref); + + #ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; + #endif +-retry: + /* First allocation attempt */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, alloc_flags, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + if (unlikely(!page)) { + /* +- * The first pass makes sure allocations are spread +- * fairly within the local node. However, the local +- * node might have free pages left after the fairness +- * batches are exhausted, and remote zones haven't +- * even been considered yet. 
Try once more without +- * fairness, and include remote zones now, before +- * entering the slowpath and waking kswapd: prefer +- * spilling to a remote zone over swapping locally. +- */ +- if (alloc_flags & ALLOC_FAIR) { +- reset_alloc_batches(zonelist, high_zoneidx, +- preferred_zone); +- alloc_flags &= ~ALLOC_FAIR; +- goto retry; +- } +- /* + * Runtime PM, block IO and its error handling path + * can deadlock because I/O on the device might not + * complete. +@@ -2763,7 +2794,7 @@ retry: + gfp_mask = memalloc_noio_flags(gfp_mask); + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask, +- preferred_zone, migratetype); ++ preferred_zone, classzone_idx, migratetype); + } + + trace_mm_page_alloc(page, order, gfp_mask, migratetype); +@@ -2775,7 +2806,7 @@ out: + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + memcg_kmem_commit_charge(page, memcg, order); +@@ -2814,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order) + { + if (put_page_testzero(page)) { + if (order == 0) +- free_hot_cold_page(page, 0); ++ free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +@@ -3043,9 +3074,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) + goto out; + + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + ret = !node_isset(nid, cpuset_current_mems_allowed); +- } while (!put_mems_allowed(cpuset_mems_cookie)); ++ } while (read_mems_allowed_retry(cpuset_mems_cookie)); + out: + return ret; + } +@@ -3198,12 +3229,12 @@ void show_free_areas(unsigned int filter) + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), +- zone->pages_scanned, ++ K(zone_page_state(zone, NR_PAGES_SCANNED)), + (!zone_reclaimable(zone) ? "yes" : "no") + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) +- printk(" %lu", zone->lowmem_reserve[i]); ++ printk(" %ld", zone->lowmem_reserve[i]); + printk("\n"); + } + +@@ -3943,6 +3974,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) + struct page *page; + unsigned long block_migratetype; + int reserve; ++ int old_reserve; + + /* + * Get the start pfn, end pfn and the number of blocks to reserve +@@ -3964,6 +3996,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); ++ old_reserve = zone->nr_migrate_reserve_block; ++ ++ /* When memory hot-add, we almost always need to do nothing */ ++ if (reserve == old_reserve) ++ return; ++ zone->nr_migrate_reserve_block = reserve; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!pfn_valid(pfn)) +@@ -4001,6 +4039,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) + reserve--; + continue; + } ++ } else if (!old_reserve) { ++ /* ++ * At boot time we don't need to scan the whole zone ++ * for turning off MIGRATE_RESERVE. 
++ */ ++ break; + } + + /* +@@ -4080,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, + + static void __meminit zone_init_free_lists(struct zone *zone) + { +- int order, t; ++ unsigned int order, t; + for_each_migratetype_order(order, t) { + INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + zone->free_area[order].nr_free = 0; +@@ -4903,7 +4947,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; +- init_zone_allows_reclaim(nid); ++ if (node_state(nid, N_MEMORY)) ++ init_zone_allows_reclaim(nid); + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + #endif +@@ -5492,7 +5537,7 @@ static void calculate_totalreserve_pages(void) + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; +- unsigned long max = 0; ++ long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { +@@ -5734,7 +5779,12 @@ module_init(init_per_zone_wmark_min) + int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) + { +- proc_dointvec(table, write, buffer, length, ppos); ++ int rc; ++ ++ rc = proc_dointvec_minmax(table, write, buffer, length, ppos); ++ if (rc) ++ return rc; ++ + if (write) { + user_min_free_kbytes = min_free_kbytes; + setup_per_zone_wmarks(); +@@ -5976,17 +6026,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) + * @end_bitidx: The last bit of interest + * returns pageblock_bits flags + */ +-unsigned long get_pageblock_flags_mask(struct page *page, ++unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) + { + struct zone *zone; + unsigned long *bitmap; +- unsigned long pfn, bitidx, word_bitidx; ++ unsigned long bitidx, word_bitidx; + unsigned long word; + + zone = page_zone(page); +- pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; +@@ -5998,25 +6047,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, + } + + /** +- * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages ++ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @start_bitidx: The first bit of interest + * @end_bitidx: The last bit of interest + * @flags: The flags to set + */ +-void set_pageblock_flags_mask(struct page *page, unsigned long flags, ++void set_pfnblock_flags_mask(struct page *page, unsigned long flags, ++ unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) + { + struct zone *zone; + unsigned long *bitmap; +- unsigned long pfn, bitidx, word_bitidx; ++ unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + zone = page_zone(page); +- pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; +@@ -6194,7 +6243,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, + cc->nr_migratepages -= nr_reclaimed; + + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, +- 0, MIGRATE_SYNC, MR_CMA); ++ NULL, 0, cc->mode, MR_CMA); + } + if (ret < 0) { + 
putback_movable_pages(&cc->migratepages); +@@ -6233,7 +6282,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), +- .sync = true, ++ .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); +@@ -6388,7 +6437,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) + { + struct page *page; + struct zone *zone; +- int order, i; ++ unsigned int order, i; + unsigned long pfn; + unsigned long flags; + /* find the first valid pfn */ +@@ -6440,7 +6489,7 @@ bool is_free_buddy_page(struct page *page) + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; +- int order; ++ unsigned int order; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { +diff --git a/mm/readahead.c b/mm/readahead.c +index e4ed04149785..0f35e983bffb 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -8,9 +8,7 @@ + */ + + #include <linux/kernel.h> +-#include <linux/fs.h> + #include <linux/gfp.h> +-#include <linux/mm.h> + #include <linux/export.h> + #include <linux/blkdev.h> + #include <linux/backing-dev.h> +@@ -20,6 +18,8 @@ + #include <linux/syscalls.h> + #include <linux/file.h> + ++#include "internal.h" ++ + /* + * Initialise a struct file's readahead state. Assumes that the caller has + * memset *ra to zero. +@@ -149,8 +149,7 @@ out: + * + * Returns the number of pages requested, or the maximum amount of I/O allowed. + */ +-static int +-__do_page_cache_readahead(struct address_space *mapping, struct file *filp, ++int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) + { +@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); +- if (page) ++ if (page && !radix_tree_exceptional_entry(page)) + continue; + + page = page_cache_alloc_readahead(mapping); +@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + return ret; + } + ++#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) + /* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. + */ + unsigned long max_sane_readahead(unsigned long nr) + { +- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) +- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); +-} +- +-/* +- * Submit IO for the read-ahead request in file_ra_state. 
+- */ +-unsigned long ra_submit(struct file_ra_state *ra, +- struct address_space *mapping, struct file *filp) +-{ +- int actual; +- +- actual = __do_page_cache_readahead(mapping, filp, +- ra->start, ra->size, ra->async_size); +- +- return actual; ++ return min(nr, MAX_READAHEAD); + } + + /* +@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, + pgoff_t head; + + rcu_read_lock(); +- head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); ++ head = page_cache_prev_hole(mapping, offset - 1, max); + rcu_read_unlock(); + + return offset - 1 - head; +@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, + unsigned long req_size) + { + unsigned long max = max_sane_readahead(ra->ra_pages); ++ pgoff_t prev_offset; + + /* + * start of file +@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping, + pgoff_t start; + + rcu_read_lock(); +- start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); ++ start = page_cache_next_hole(mapping, offset + 1, max); + rcu_read_unlock(); + + if (!start || start - offset > max) +@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping, + + /* + * sequential cache miss ++ * trivial case: (offset - prev_offset) == 1 ++ * unaligned reads: (offset - prev_offset) == 0 + */ +- if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) ++ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; ++ if (offset - prev_offset <= 1UL) + goto initial_readahead; + + /* +diff --git a/mm/shmem.c b/mm/shmem.c +index 0da81aaeb4cc..ab05681f41cd 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, + pgoff_t index, void *expected, void *replacement) + { + void **pslot; +- void *item = NULL; ++ void *item; + + VM_BUG_ON(!expected); ++ VM_BUG_ON(!replacement); + pslot = radix_tree_lookup_slot(&mapping->page_tree, index); +- if (pslot) +- item = radix_tree_deref_slot_protected(pslot, +- &mapping->tree_lock); ++ if (!pslot) ++ return -ENOENT; ++ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); + if (item != expected) + return -ENOENT; +- if (replacement) +- radix_tree_replace_slot(pslot, replacement); +- else +- radix_tree_delete(&mapping->page_tree, index); ++ radix_tree_replace_slot(pslot, replacement); + return 0; + } + +@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) + } + + /* +- * Like find_get_pages, but collecting swap entries as well as pages. +- */ +-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, +- pgoff_t start, unsigned int nr_pages, +- struct page **pages, pgoff_t *indices) +-{ +- void **slot; +- unsigned int ret = 0; +- struct radix_tree_iter iter; +- +- if (!nr_pages) +- return 0; +- +- rcu_read_lock(); +-restart: +- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { +- struct page *page; +-repeat: +- page = radix_tree_deref_slot(slot); +- if (unlikely(!page)) +- continue; +- if (radix_tree_exception(page)) { +- if (radix_tree_deref_retry(page)) +- goto restart; +- /* +- * Otherwise, we must be storing a swap entry +- * here as an exceptional entry: so return it +- * without attempting to raise page count. +- */ +- goto export; +- } +- if (!page_cache_get_speculative(page)) +- goto repeat; +- +- /* Has the page moved? 
*/ +- if (unlikely(page != *slot)) { +- page_cache_release(page); +- goto repeat; +- } +-export: +- indices[ret] = iter.index; +- pages[ret] = page; +- if (++ret == nr_pages) +- break; +- } +- rcu_read_unlock(); +- return ret; +-} +- +-/* + * Remove swap entry from radix tree, free the swap and its page cache. + */ + static int shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) + { +- int error; ++ void *old; + + spin_lock_irq(&mapping->tree_lock); +- error = shmem_radix_tree_replace(mapping, index, radswap, NULL); ++ old = radix_tree_delete_item(&mapping->page_tree, index, radswap); + spin_unlock_irq(&mapping->tree_lock); +- if (!error) +- free_swap_and_cache(radix_to_swp_entry(radswap)); +- return error; +-} +- +-/* +- * Pagevec may contain swap entries, so shuffle up pages before releasing. +- */ +-static void shmem_deswap_pagevec(struct pagevec *pvec) +-{ +- int i, j; +- +- for (i = 0, j = 0; i < pagevec_count(pvec); i++) { +- struct page *page = pvec->pages[i]; +- if (!radix_tree_exceptional_entry(page)) +- pvec->pages[j++] = page; +- } +- pvec->nr = j; ++ if (old != radswap) ++ return -ENOENT; ++ free_swap_and_cache(radix_to_swp_entry(radswap)); ++ return 0; + } + + /* +@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping) + * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it + * has finished, if it hits a row of PAGEVEC_SIZE swap entries. + */ +- pvec.nr = shmem_find_get_pages_and_swap(mapping, index, +- PAGEVEC_SIZE, pvec.pages, indices); ++ pvec.nr = find_get_entries(mapping, index, ++ PAGEVEC_SIZE, pvec.pages, indices); + if (!pvec.nr) + break; + index = indices[pvec.nr - 1] + 1; +- shmem_deswap_pagevec(&pvec); ++ pagevec_remove_exceptionals(&pvec); + check_move_unevictable_pages(pvec.pages, pvec.nr); + pagevec_release(&pvec); + cond_resched(); +@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + pagevec_init(&pvec, 0); + index = start; + while (index < end) { +- pvec.nr = shmem_find_get_pages_and_swap(mapping, index, +- min(end - index, (pgoff_t)PAGEVEC_SIZE), +- pvec.pages, indices); ++ pvec.nr = find_get_entries(mapping, index, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE), ++ pvec.pages, indices); + if (!pvec.nr) + break; + mem_cgroup_uncharge_start(); +@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + } + unlock_page(page); + } +- shmem_deswap_pagevec(&pvec); ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); +@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + index = start; + while (index < end) { + cond_resched(); +- pvec.nr = shmem_find_get_pages_and_swap(mapping, index, ++ ++ pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), +- pvec.pages, indices); ++ pvec.pages, indices); + if (!pvec.nr) { + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) +@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + } + unlock_page(page); + } +- shmem_deswap_pagevec(&pvec); ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + index++; +@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + return -EFBIG; + repeat: + swap.val = 0; +- page = find_lock_page(mapping, index); ++ page = find_lock_entry(mapping, index); + if 
(radix_tree_exceptional_entry(page)) { + swap = radix_to_swp_entry(page); + page = NULL; +@@ -1102,6 +1037,9 @@ repeat: + goto failed; + } + ++ if (page && sgp == SGP_WRITE) ++ mark_page_accessed(page); ++ + /* fallocated page? */ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) +@@ -1183,6 +1121,9 @@ repeat: + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + ++ if (sgp == SGP_WRITE) ++ mark_page_accessed(page); ++ + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); +@@ -1207,8 +1148,11 @@ repeat: + goto decused; + } + +- SetPageSwapBacked(page); ++ __SetPageSwapBacked(page); + __set_page_locked(page); ++ if (sgp == SGP_WRITE) ++ init_page_accessed(page); ++ + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (error) +@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode + return inode; + } + ++bool shmem_mapping(struct address_space *mapping) ++{ ++ return mapping->backing_dev_info == &shmem_backing_dev_info; ++} ++ + #ifdef CONFIG_TMPFS + static const struct inode_operations shmem_symlink_inode_operations; + static const struct inode_operations shmem_short_symlink_operations; +@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { +- pvec.nr = shmem_find_get_pages_and_swap(mapping, index, ++ pvec.nr = find_get_entries(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (whence == SEEK_DATA) +@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + break; + } + } +- shmem_deswap_pagevec(&pvec); ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); +diff --git a/mm/slab.c b/mm/slab.c +index 2580db062df9..eb4078c7d183 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, + { + if (unlikely(pfmemalloc_active)) { + /* Some pfmemalloc slabs exist, check if this is one */ +- struct page *page = virt_to_head_page(objp); ++ struct slab *slabp = virt_to_slab(objp); ++ struct page *page = virt_to_head_page(slabp->s_mem); + if (PageSlabPfmemalloc(page)) + set_obj_pfmemalloc(&objp); + } +@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) + __SetPageSlab(page + i); + + if (page->pfmemalloc) +- SetPageSlabPfmemalloc(page + i); ++ SetPageSlabPfmemalloc(page); + } + memcg_bind_pages(cachep, cachep->gfporder); + +@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) + else + sub_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_freed); ++ ++ __ClearPageSlabPfmemalloc(page); + while (i--) { + BUG_ON(!PageSlab(page)); +- __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page++; + } +@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(slab_node(), flags); + + retry: +@@ -3276,7 +3278,7 @@ retry: + } + } + +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) ++ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return obj; + } +diff --git a/mm/slub.c b/mm/slub.c +index 
5c1343a391d0..a88d94cfee20 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1635,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + return NULL; + + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(slab_node(), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; +@@ -1647,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + object = get_partial_node(s, n, c, flags); + if (object) { + /* +- * Return the object even if +- * put_mems_allowed indicated that +- * the cpuset mems_allowed was +- * updated in parallel. It's a +- * harmless race between the alloc +- * and the cpuset update. ++ * Don't check read_mems_allowed_retry() ++ * here - if mems_allowed was updated in ++ * parallel, that was a harmless race ++ * between allocation and the cpuset ++ * update + */ +- put_mems_allowed(cpuset_mems_cookie); + return object; + } + } + } +- } while (!put_mems_allowed(cpuset_mems_cookie)); ++ } while (read_mems_allowed_retry(cpuset_mems_cookie)); + #endif + return NULL; + } +diff --git a/mm/swap.c b/mm/swap.c +index aa4da5d9401d..16e70ce1912a 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -68,7 +68,7 @@ static void __page_cache_release(struct page *page) + static void __put_single_page(struct page *page) + { + __page_cache_release(page); +- free_hot_cold_page(page, 0); ++ free_hot_cold_page(page, false); + } + + static void __put_compound_page(struct page *page) +@@ -437,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(page, lruvec, lru); +- trace_mm_lru_activate(page, page_to_pfn(page)); ++ trace_mm_lru_activate(page); + + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(lruvec, file, 1); +@@ -549,12 +549,17 @@ void mark_page_accessed(struct page *page) + EXPORT_SYMBOL(mark_page_accessed); + + /* +- * Queue the page for addition to the LRU via pagevec. The decision on whether +- * to add the page to the [in]active [file|anon] list is deferred until the +- * pagevec is drained. This gives a chance for the caller of __lru_cache_add() +- * have the page added to the active list using mark_page_accessed(). ++ * Used to mark_page_accessed(page) that is not visible yet and when it is ++ * still safe to use non-atomic ops + */ +-void __lru_cache_add(struct page *page) ++void init_page_accessed(struct page *page) ++{ ++ if (!PageReferenced(page)) ++ __SetPageReferenced(page); ++} ++EXPORT_SYMBOL(init_page_accessed); ++ ++static void __lru_cache_add(struct page *page) + { + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + +@@ -564,11 +569,34 @@ void __lru_cache_add(struct page *page) + pagevec_add(pvec, page); + put_cpu_var(lru_add_pvec); + } +-EXPORT_SYMBOL(__lru_cache_add); ++ ++/** ++ * lru_cache_add: add a page to the page lists ++ * @page: the page to add ++ */ ++void lru_cache_add_anon(struct page *page) ++{ ++ if (PageActive(page)) ++ ClearPageActive(page); ++ __lru_cache_add(page); ++} ++ ++void lru_cache_add_file(struct page *page) ++{ ++ if (PageActive(page)) ++ ClearPageActive(page); ++ __lru_cache_add(page); ++} ++EXPORT_SYMBOL(lru_cache_add_file); + + /** + * lru_cache_add - add a page to a page list + * @page: the page to be added to the LRU. ++ * ++ * Queue the page for addition to the LRU via pagevec. 
The decision on whether ++ * to add the page to the [in]active [file|anon] list is deferred until the ++ * pagevec is drained. This gives a chance for the caller of lru_cache_add() ++ * have the page added to the active list using mark_page_accessed(). + */ + void lru_cache_add(struct page *page) + { +@@ -779,7 +807,7 @@ void lru_add_drain_all(void) + * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() + * will free it. + */ +-void release_pages(struct page **pages, int nr, int cold) ++void release_pages(struct page **pages, int nr, bool cold) + { + int i; + LIST_HEAD(pages_to_free); +@@ -820,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold) + } + + /* Clear Active bit in case of parallel mark_page_accessed */ +- ClearPageActive(page); ++ __ClearPageActive(page); + + list_add(&page->lru, &pages_to_free); + } +@@ -902,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, lru); + update_page_reclaim_stat(lruvec, file, active); +- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); ++ trace_mm_lru_insertion(page, lru); + } + + /* +@@ -916,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec) + EXPORT_SYMBOL(__pagevec_lru_add); + + /** ++ * pagevec_lookup_entries - gang pagecache lookup ++ * @pvec: Where the resulting entries are placed ++ * @mapping: The address_space to search ++ * @start: The starting entry index ++ * @nr_entries: The maximum number of entries ++ * @indices: The cache indices corresponding to the entries in @pvec ++ * ++ * pagevec_lookup_entries() will search for and return a group of up ++ * to @nr_entries pages and shadow entries in the mapping. All ++ * entries are placed in @pvec. pagevec_lookup_entries() takes a ++ * reference against actual pages in @pvec. ++ * ++ * The search returns a group of mapping-contiguous entries with ++ * ascending indexes. There may be holes in the indices due to ++ * not-present entries. ++ * ++ * pagevec_lookup_entries() returns the number of entries which were ++ * found. ++ */ ++unsigned pagevec_lookup_entries(struct pagevec *pvec, ++ struct address_space *mapping, ++ pgoff_t start, unsigned nr_pages, ++ pgoff_t *indices) ++{ ++ pvec->nr = find_get_entries(mapping, start, nr_pages, ++ pvec->pages, indices); ++ return pagevec_count(pvec); ++} ++ ++/** ++ * pagevec_remove_exceptionals - pagevec exceptionals pruning ++ * @pvec: The pagevec to prune ++ * ++ * pagevec_lookup_entries() fills both pages and exceptional radix ++ * tree entries into the pagevec. This function prunes all ++ * exceptionals from @pvec without leaving holes, so that it can be ++ * passed on to page-only pagevec operations. 
++ */ ++void pagevec_remove_exceptionals(struct pagevec *pvec) ++{ ++ int i, j; ++ ++ for (i = 0, j = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ if (!radix_tree_exceptional_entry(page)) ++ pvec->pages[j++] = page; ++ } ++ pvec->nr = j; ++} ++ ++/** + * pagevec_lookup - gang pagecache lookup + * @pvec: Where the resulting pages are placed + * @mapping: The address_space to search +diff --git a/mm/swap_state.c b/mm/swap_state.c +index e6f15f8ca2af..4079edfff2cc 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) + return ret; + } + ++static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); ++ + void show_swap_cache_info(void) + { + printk("%lu pages in swap cache\n", total_swapcache_pages()); +@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) + + for (i = 0; i < todo; i++) + free_swap_cache(pagep[i]); +- release_pages(pagep, todo, 0); ++ release_pages(pagep, todo, false); + pagep += todo; + nr -= todo; + } +@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) + + page = find_get_page(swap_address_space(entry), entry.val); + +- if (page) ++ if (page) { + INC_CACHE_INFO(find_success); ++ if (TestClearPageReadahead(page)) ++ atomic_inc(&swapin_readahead_hits); ++ } + + INC_CACHE_INFO(find_total); + return page; +@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + return found_page; + } + ++static unsigned long swapin_nr_pages(unsigned long offset) ++{ ++ static unsigned long prev_offset; ++ unsigned int pages, max_pages, last_ra; ++ static atomic_t last_readahead_pages; ++ ++ max_pages = 1 << ACCESS_ONCE(page_cluster); ++ if (max_pages <= 1) ++ return 1; ++ ++ /* ++ * This heuristic has been found to work well on both sequential and ++ * random loads, swapping to hard disk or to SSD: please don't ask ++ * what the "+ 2" means, it just happens to work well, that's all. ++ */ ++ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; ++ if (pages == 2) { ++ /* ++ * We can have no readahead hits to judge by: but must not get ++ * stuck here forever, so check for an adjacent offset instead ++ * (and don't even bother to check whether swap type is same). ++ */ ++ if (offset != prev_offset + 1 && offset != prev_offset - 1) ++ pages = 1; ++ prev_offset = offset; ++ } else { ++ unsigned int roundup = 4; ++ while (roundup < pages) ++ roundup <<= 1; ++ pages = roundup; ++ } ++ ++ if (pages > max_pages) ++ pages = max_pages; ++ ++ /* Don't shrink readahead too fast */ ++ last_ra = atomic_read(&last_readahead_pages) / 2; ++ if (pages < last_ra) ++ pages = last_ra; ++ atomic_set(&last_readahead_pages, pages); ++ ++ return pages; ++} ++ + /** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory +@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) + { + struct page *page; +- unsigned long offset = swp_offset(entry); ++ unsigned long entry_offset = swp_offset(entry); ++ unsigned long offset = entry_offset; + unsigned long start_offset, end_offset; +- unsigned long mask = (1UL << page_cluster) - 1; ++ unsigned long mask; + struct blk_plug plug; + ++ mask = swapin_nr_pages(offset) - 1; ++ if (!mask) ++ goto skip; ++ + /* Read a page_cluster sized and aligned cluster around offset. 
*/ + start_offset = offset & ~mask; + end_offset = offset | mask; +@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + gfp_mask, vma, addr); + if (!page) + continue; ++ if (offset != entry_offset) ++ SetPageReadahead(page); + page_cache_release(page); + } + blk_finish_plug(&plug); + + lru_add_drain(); /* Push any new pages onto the LRU now */ ++skip: + return read_swap_cache_async(entry, gfp_mask, vma, addr); + } +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 0ec2eaf3ccfd..660b9c0e2e40 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; + /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ + long total_swap_pages; + static int least_priority; +-static atomic_t highest_priority_index = ATOMIC_INIT(-1); + + static const char Bad_file[] = "Bad swap file entry "; + static const char Unused_file[] = "Unused swap file entry "; + static const char Bad_offset[] = "Bad swap offset entry "; + static const char Unused_offset[] = "Unused swap offset entry "; + +-struct swap_list_t swap_list = {-1, -1}; ++/* ++ * all active swap_info_structs ++ * protected with swap_lock, and ordered by priority. ++ */ ++PLIST_HEAD(swap_active_head); ++ ++/* ++ * all available (active, not full) swap_info_structs ++ * protected with swap_avail_lock, ordered by priority. ++ * This is used by get_swap_page() instead of swap_active_head ++ * because swap_active_head includes all swap_info_structs, ++ * but get_swap_page() doesn't need to look at full ones. ++ * This uses its own lock instead of swap_lock because when a ++ * swap_info_struct changes between not-full/full, it needs to ++ * add/remove itself to/from this list, but the swap_info_struct->lock ++ * is held and the locking order requires swap_lock to be taken ++ * before any swap_info_struct->lock. ++ */ ++static PLIST_HEAD(swap_avail_head); ++static DEFINE_SPINLOCK(swap_avail_lock); + + struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +@@ -591,6 +609,9 @@ checks: + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; ++ spin_lock(&swap_avail_lock); ++ plist_del(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); +@@ -639,71 +660,65 @@ no_page: + + swp_entry_t get_swap_page(void) + { +- struct swap_info_struct *si; ++ struct swap_info_struct *si, *next; + pgoff_t offset; +- int type, next; +- int wrapped = 0; +- int hp_index; + +- spin_lock(&swap_lock); + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + +- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { +- hp_index = atomic_xchg(&highest_priority_index, -1); +- /* +- * highest_priority_index records current highest priority swap +- * type which just frees swap entries. If its priority is +- * higher than that of swap_list.next swap type, we use it. It +- * isn't protected by swap_lock, so it can be an invalid value +- * if the corresponding swap type is swapoff. We double check +- * the flags here. It's even possible the swap type is swapoff +- * and swapon again and its priority is changed. In such rare +- * case, low prority swap type might be used, but eventually +- * high priority swap will be used after several rounds of +- * swap. 
+- */ +- if (hp_index != -1 && hp_index != type && +- swap_info[type]->prio < swap_info[hp_index]->prio && +- (swap_info[hp_index]->flags & SWP_WRITEOK)) { +- type = hp_index; +- swap_list.next = type; +- } +- +- si = swap_info[type]; +- next = si->next; +- if (next < 0 || +- (!wrapped && si->prio != swap_info[next]->prio)) { +- next = swap_list.head; +- wrapped++; +- } ++ spin_lock(&swap_avail_lock); + ++start_over: ++ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { ++ /* requeue si to after same-priority siblings */ ++ plist_requeue(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&si->lock); +- if (!si->highest_bit) { ++ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ if (plist_node_empty(&si->avail_list)) { ++ spin_unlock(&si->lock); ++ goto nextsi; ++ } ++ WARN(!si->highest_bit, ++ "swap_info %d in list but !highest_bit\n", ++ si->type); ++ WARN(!(si->flags & SWP_WRITEOK), ++ "swap_info %d in list but !SWP_WRITEOK\n", ++ si->type); ++ plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&si->lock); +- continue; ++ goto nextsi; + } +- if (!(si->flags & SWP_WRITEOK)) { +- spin_unlock(&si->lock); +- continue; +- } +- +- swap_list.next = next; + +- spin_unlock(&swap_lock); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) +- return swp_entry(type, offset); +- spin_lock(&swap_lock); +- next = swap_list.next; ++ return swp_entry(si->type, offset); ++ pr_debug("scan_swap_map of si %d failed to find offset\n", ++ si->type); ++ spin_lock(&swap_avail_lock); ++nextsi: ++ /* ++ * if we got here, it's likely that si was almost full before, ++ * and since scan_swap_map() can drop the si->lock, multiple ++ * callers probably all tried to get a page from the same si ++ * and it filled up before we could get one; or, the si filled ++ * up between us dropping swap_avail_lock and taking si->lock. ++ * Since we dropped the swap_avail_lock, the swap_avail_head ++ * list may have been modified; so if next is still in the ++ * swap_avail_head list then try it, otherwise start over. ++ */ ++ if (plist_node_empty(&next->avail_list)) ++ goto start_over; + } + ++ spin_unlock(&swap_avail_lock); ++ + atomic_long_inc(&nr_swap_pages); + noswap: +- spin_unlock(&swap_lock); + return (swp_entry_t) {0}; + } + +@@ -765,27 +780,6 @@ out: + return NULL; + } + +-/* +- * This swap type frees swap entry, check if it is the highest priority swap +- * type which just frees swap entry. get_swap_page() uses +- * highest_priority_index to search highest priority swap type. The +- * swap_info_struct.lock can't protect us if there are multiple swap types +- * active, so we use atomic_cmpxchg. 
+- */ +-static void set_highest_priority_index(int type) +-{ +- int old_hp_index, new_hp_index; +- +- do { +- old_hp_index = atomic_read(&highest_priority_index); +- if (old_hp_index != -1 && +- swap_info[old_hp_index]->prio >= swap_info[type]->prio) +- break; +- new_hp_index = type; +- } while (atomic_cmpxchg(&highest_priority_index, +- old_hp_index, new_hp_index) != old_hp_index); +-} +- + static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) + { +@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, + dec_cluster_info_page(p, p->cluster_info, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; +- if (offset > p->highest_bit) ++ if (offset > p->highest_bit) { ++ bool was_full = !p->highest_bit; + p->highest_bit = offset; +- set_highest_priority_index(p->type); ++ if (was_full && (p->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ WARN_ON(!plist_node_empty(&p->avail_list)); ++ if (plist_node_empty(&p->avail_list)) ++ plist_add(&p->avail_list, ++ &swap_avail_head); ++ spin_unlock(&swap_avail_lock); ++ } ++ } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); +@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) + { +- int i, prev; +- + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; ++ /* ++ * the plist prio is negated because plist ordering is ++ * low-to-high, while swap ordering is high-to-low ++ */ ++ p->list.prio = -p->prio; ++ p->avail_list.prio = -p->prio; + p->swap_map = swap_map; + p->cluster_info = cluster_info; + p->flags |= SWP_WRITEOK; + atomic_long_add(p->pages, &nr_swap_pages); + total_swap_pages += p->pages; + +- /* insert swap space into swap_list: */ +- prev = -1; +- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { +- if (p->prio >= swap_info[i]->prio) +- break; +- prev = i; +- } +- p->next = i; +- if (prev < 0) +- swap_list.head = swap_list.next = p->type; +- else +- swap_info[prev]->next = p->type; ++ assert_spin_locked(&swap_lock); ++ /* ++ * both lists are plists, and thus priority ordered. ++ * swap_active_head needs to be priority ordered for swapoff(), ++ * which on removal of any swap_info_struct with an auto-assigned ++ * (i.e. negative) priority increments the auto-assigned priority ++ * of any lower-priority swap_info_structs. ++ * swap_avail_head needs to be priority ordered for get_swap_page(), ++ * which allocates swap pages from the highest available priority ++ * swap_info_struct. 
++ */ ++ plist_add(&p->list, &swap_active_head); ++ spin_lock(&swap_avail_lock); ++ plist_add(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + + static void enable_swap_info(struct swap_info_struct *p, int prio, +@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + struct address_space *mapping; + struct inode *inode; + struct filename *pathname; +- int i, type, prev; +- int err; ++ int err, found = 0; + unsigned int old_block_size; + + if (!capable(CAP_SYS_ADMIN)) +@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + goto out; + + mapping = victim->f_mapping; +- prev = -1; + spin_lock(&swap_lock); +- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { +- p = swap_info[type]; ++ plist_for_each_entry(p, &swap_active_head, list) { + if (p->flags & SWP_WRITEOK) { +- if (p->swap_file->f_mapping == mapping) ++ if (p->swap_file->f_mapping == mapping) { ++ found = 1; + break; ++ } + } +- prev = type; + } +- if (type < 0) { ++ if (!found) { + err = -EINVAL; + spin_unlock(&swap_lock); + goto out_dput; +@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + spin_unlock(&swap_lock); + goto out_dput; + } +- if (prev < 0) +- swap_list.head = p->next; +- else +- swap_info[prev]->next = p->next; +- if (type == swap_list.next) { +- /* just pick something that's safe... */ +- swap_list.next = swap_list.head; +- } ++ spin_lock(&swap_avail_lock); ++ plist_del(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&p->lock); + if (p->prio < 0) { +- for (i = p->next; i >= 0; i = swap_info[i]->next) +- swap_info[i]->prio = p->prio--; ++ struct swap_info_struct *si = p; ++ ++ plist_for_each_entry_continue(si, &swap_active_head, list) { ++ si->prio++; ++ si->list.prio--; ++ si->avail_list.prio--; ++ } + least_priority++; + } ++ plist_del(&p->list, &swap_active_head); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; +@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + spin_unlock(&swap_lock); + + set_current_oom_origin(); +- err = try_to_unuse(type, false, 0); /* force all pages to be unused */ ++ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + clear_current_oom_origin(); + + if (err) { +@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + frontswap_map_set(p, NULL); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +- frontswap_invalidate_area(type); ++ frontswap_invalidate_area(p->type); + mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; +@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account informatin */ +- swap_cgroup_swapoff(type); ++ swap_cgroup_swapoff(p->type); + + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { +@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void) + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); ++ plist_node_init(&p->list, 0); ++ plist_node_init(&p->avail_list, 0); + p->flags = SWP_USED; +- p->next = -1; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock); + +diff --git a/mm/truncate.c b/mm/truncate.c +index 353b683afd6e..2e84fe59190b 100644 +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -22,6 +22,22 @@ + #include <linux/cleancache.h> + #include "internal.h" + ++static void clear_exceptional_entry(struct 
address_space *mapping, ++ pgoff_t index, void *entry) ++{ ++ /* Handled by shmem itself */ ++ if (shmem_mapping(mapping)) ++ return; ++ ++ spin_lock_irq(&mapping->tree_lock); ++ /* ++ * Regular page slots are stabilized by the page lock even ++ * without the tree itself locked. These unlocked entries ++ * need verification under the tree lock. ++ */ ++ radix_tree_delete_item(&mapping->page_tree, index, entry); ++ spin_unlock_irq(&mapping->tree_lock); ++} + + /** + * do_invalidatepage - invalidate part or all of a page +@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping, + unsigned int partial_start; /* inclusive */ + unsigned int partial_end; /* exclusive */ + struct pagevec pvec; ++ pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index; + int i; + +@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping, + + pagevec_init(&pvec, 0); + index = start; +- while (index < end && pagevec_lookup(&pvec, mapping, index, +- min(end - index, (pgoff_t)PAGEVEC_SIZE))) { ++ while (index < end && pagevec_lookup_entries(&pvec, mapping, index, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE), ++ indices)) { + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ +- index = page->index; ++ index = indices[i]; + if (index >= end) + break; + ++ if (radix_tree_exceptional_entry(page)) { ++ clear_exceptional_entry(mapping, index, page); ++ continue; ++ } ++ + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); +@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping, + truncate_inode_page(mapping, page); + unlock_page(page); + } ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); +@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping, + index = start; + for ( ; ; ) { + cond_resched(); +- if (!pagevec_lookup(&pvec, mapping, index, +- min(end - index, (pgoff_t)PAGEVEC_SIZE))) { ++ if (!pagevec_lookup_entries(&pvec, mapping, index, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE), ++ indices)) { + if (index == start) + break; + index = start; + continue; + } +- if (index == start && pvec.pages[0]->index >= end) { ++ if (index == start && indices[0] >= end) { ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + break; + } +@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping, + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ +- index = page->index; ++ index = indices[i]; + if (index >= end) + break; + ++ if (radix_tree_exceptional_entry(page)) { ++ clear_exceptional_entry(mapping, index, page); ++ continue; ++ } ++ + lock_page(page); + WARN_ON(page->index != index); + wait_on_page_writeback(page); + truncate_inode_page(mapping, page); + unlock_page(page); + } ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + index++; +@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages); + unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) + { ++ pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + pgoff_t index = start; + unsigned long ret; +@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, + */ + + pagevec_init(&pvec, 0); +- while (index <= end && pagevec_lookup(&pvec, mapping, index, +- min(end - index, 
(pgoff_t)PAGEVEC_SIZE - 1) + 1)) { ++ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, ++ indices)) { + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ +- index = page->index; ++ index = indices[i]; + if (index > end) + break; + ++ if (radix_tree_exceptional_entry(page)) { ++ clear_exceptional_entry(mapping, index, page); ++ continue; ++ } ++ + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); +@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, + deactivate_page(page); + count += ret; + } ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); +@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) + int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) + { ++ pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + pgoff_t index; + int i; +@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, + cleancache_invalidate_inode(mapping); + pagevec_init(&pvec, 0); + index = start; +- while (index <= end && pagevec_lookup(&pvec, mapping, index, +- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { ++ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, ++ indices)) { + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ +- index = page->index; ++ index = indices[i]; + if (index > end) + break; + ++ if (radix_tree_exceptional_entry(page)) { ++ clear_exceptional_entry(mapping, index, page); ++ continue; ++ } ++ + lock_page(page); + WARN_ON(page->index != index); + if (page->mapping != mapping) { +@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, + ret = ret2; + unlock_page(page); + } ++ pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); +diff --git a/mm/vmacache.c b/mm/vmacache.c +new file mode 100644 +index 000000000000..1037a3bab505 +--- /dev/null ++++ b/mm/vmacache.c +@@ -0,0 +1,114 @@ ++/* ++ * Copyright (C) 2014 Davidlohr Bueso. ++ */ ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/vmacache.h> ++ ++/* ++ * Flush vma caches for threads that share a given mm. ++ * ++ * The operation is safe because the caller holds the mmap_sem ++ * exclusively and other threads accessing the vma cache will ++ * have mmap_sem held at least for read, so no extra locking ++ * is required to maintain the vma cache. ++ */ ++void vmacache_flush_all(struct mm_struct *mm) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * Only flush the vmacache pointers as the ++ * mm seqnum is already set and curr's will ++ * be set upon invalidation when the next ++ * lookup is done. ++ */ ++ if (mm == p->mm) ++ vmacache_flush(p); ++ } ++ rcu_read_unlock(); ++} ++ ++/* ++ * This task may be accessing a foreign mm via (for example) ++ * get_user_pages()->find_vma(). The vmacache is task-local and this ++ * task's vmacache pertains to a different mm (ie, its own). There is ++ * nothing we can do here. 
++ * ++ * Also handle the case where a kernel thread has adopted this mm via use_mm(). ++ * That kernel thread's vmacache is not applicable to this mm. ++ */ ++static bool vmacache_valid_mm(struct mm_struct *mm) ++{ ++ return current->mm == mm && !(current->flags & PF_KTHREAD); ++} ++ ++void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) ++{ ++ if (vmacache_valid_mm(newvma->vm_mm)) ++ current->vmacache[VMACACHE_HASH(addr)] = newvma; ++} ++ ++static bool vmacache_valid(struct mm_struct *mm) ++{ ++ struct task_struct *curr; ++ ++ if (!vmacache_valid_mm(mm)) ++ return false; ++ ++ curr = current; ++ if (mm->vmacache_seqnum != curr->vmacache_seqnum) { ++ /* ++ * First attempt will always be invalid, initialize ++ * the new cache for this task here. ++ */ ++ curr->vmacache_seqnum = mm->vmacache_seqnum; ++ vmacache_flush(curr); ++ return false; ++ } ++ return true; ++} ++ ++struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) ++{ ++ int i; ++ ++ if (!vmacache_valid(mm)) ++ return NULL; ++ ++ for (i = 0; i < VMACACHE_SIZE; i++) { ++ struct vm_area_struct *vma = current->vmacache[i]; ++ ++ if (!vma) ++ continue; ++ if (WARN_ON_ONCE(vma->vm_mm != mm)) ++ break; ++ if (vma->vm_start <= addr && vma->vm_end > addr) ++ return vma; ++ } ++ ++ return NULL; ++} ++ ++#ifndef CONFIG_MMU ++struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, ++ unsigned long start, ++ unsigned long end) ++{ ++ int i; ++ ++ if (!vmacache_valid(mm)) ++ return NULL; ++ ++ for (i = 0; i < VMACACHE_SIZE; i++) { ++ struct vm_area_struct *vma = current->vmacache[i]; ++ ++ if (vma && vma->vm_start == start && vma->vm_end == end) ++ return vma; ++ } ++ ++ return NULL; ++} ++#endif +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index e2be0f802ccf..060dc366ac44 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -2685,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) + + prev_end = VMALLOC_START; + +- spin_lock(&vmap_area_lock); ++ rcu_read_lock(); + + if (list_empty(&vmap_area_list)) { + vmi->largest_chunk = VMALLOC_TOTAL; + goto out; + } + +- list_for_each_entry(va, &vmap_area_list, list) { ++ list_for_each_entry_rcu(va, &vmap_area_list, list) { + unsigned long addr = va->va_start; + + /* +@@ -2719,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) + vmi->largest_chunk = VMALLOC_END - prev_end; + + out: +- spin_unlock(&vmap_area_lock); ++ rcu_read_unlock(); + } + #endif + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 5ad29b2925a0..5461d02ea718 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) + + bool zone_reclaimable(struct zone *zone) + { +- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; ++ return zone_page_state(zone, NR_PAGES_SCANNED) < ++ zone_reclaimable_pages(zone) * 6; + } + + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long freed = 0; + unsigned long long delta; + long total_scan; +- long max_pass; ++ long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? 
shrinker->batch + : SHRINK_BATCH; + +- max_pass = shrinker->count_objects(shrinker, shrinkctl); +- if (max_pass == 0) ++ freeable = shrinker->count_objects(shrinker, shrinkctl); ++ if (freeable == 0) + return 0; + + /* +@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; +- delta *= max_pass; ++ delta *= freeable; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); +- total_scan = max_pass; ++ total_scan = freeable; + } + + /* +@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> +- * max_pass. This is bad for sustaining a working set in ++ * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ +- if (delta < max_pass / 4) +- total_scan = min(total_scan, max_pass / 2); ++ if (delta < freeable / 4) ++ total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ +- if (total_scan > max_pass * 2) +- total_scan = max_pass * 2; ++ if (total_scan > freeable * 2) ++ total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, +- max_pass, delta, total_scan); ++ freeable, delta, total_scan); + +- while (total_scan >= batch_size) { ++ /* ++ * Normally, we should not scan less than batch_size objects in one ++ * pass to avoid too frequent shrinker calls, but if the slab has less ++ * than batch_size objects in total and we are really tight on memory, ++ * we will try to reclaim all available objects, otherwise we can end ++ * up failing allocations although there are plenty of reclaimable ++ * objects spread over several slabs with usage less than the ++ * batch_size. ++ * ++ * We detect the "tight on memory" situations by looking at the total ++ * number of objects we want to scan (total_scan). If it is greater ++ * than the total number of objects on slab (freeable), we must be ++ * scanning at high prio and therefore should try to reclaim as much as ++ * possible. 
++ */ ++ while (total_scan >= batch_size || ++ total_scan >= freeable) { + unsigned long ret; ++ unsigned long nr_to_scan = min(batch_size, total_scan); + +- shrinkctl->nr_to_scan = batch_size; ++ shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + +- count_vm_events(SLABS_SCANNED, batch_size); +- total_scan -= batch_size; ++ count_vm_events(SLABS_SCANNED, nr_to_scan); ++ total_scan -= nr_to_scan; + + cond_resched(); + } +@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, + } + + list_for_each_entry(shrinker, &shrinker_list, list) { +- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { +- if (!node_online(shrinkctl->nid)) +- continue; +- +- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && +- (shrinkctl->nid != 0)) +- break; +- ++ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { ++ shrinkctl->nid = 0; + freed += shrink_slab_node(shrinkctl, shrinker, +- nr_pages_scanned, lru_pages); ++ nr_pages_scanned, lru_pages); ++ continue; ++ } ++ ++ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { ++ if (node_online(shrinkctl->nid)) ++ freed += shrink_slab_node(shrinkctl, shrinker, ++ nr_pages_scanned, lru_pages); + + } + } +@@ -1089,7 +1108,7 @@ keep: + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + } + +- free_hot_cold_page_list(&free_pages, 1); ++ free_hot_cold_page_list(&free_pages, true); + + list_splice(&ret_pages, page_list); + count_vm_events(PGACTIVATE, pgactivate); +@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); +- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); ++ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; + } + +@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + + if (global_reclaim(sc)) { +- zone->pages_scanned += nr_scanned; ++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + else +@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + + spin_unlock_irq(&zone->lru_lock); + +- free_hot_cold_page_list(&page_list, 1); ++ free_hot_cold_page_list(&page_list, true); + + /* + * If reclaim is isolating dirty pages under writeback, it implies +@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan, + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); + if (global_reclaim(sc)) +- zone->pages_scanned += nr_scanned; ++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + + reclaim_stat->recent_scanned[file] += nr_taken; + +@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan, + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&zone->lru_lock); + +- free_hot_cold_page_list(&l_hold, 1); ++ free_hot_cold_page_list(&l_hold, true); + } + + #ifdef CONFIG_SWAP +@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + struct zone *zone = lruvec_zone(lruvec); + unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; +- unsigned long anon, file, free; ++ unsigned long anon, file; + bool force_scan = false; + unsigned long ap, fp; + enum lru_list 
lru; +@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + +- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + +- get_lru_size(lruvec, LRU_INACTIVE_ANON); +- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + +- get_lru_size(lruvec, LRU_INACTIVE_FILE); +- + /* + * If it's foreseeable that reclaiming the file cache won't be + * enough to get the zone back into a desirable shape, we have +@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + * thrashing - remaining file pages alone. + */ + if (global_reclaim(sc)) { +- free = zone_page_state(zone, NR_FREE_PAGES); +- if (unlikely(file + free <= high_wmark_pages(zone))) { ++ unsigned long zonefile; ++ unsigned long zonefree; ++ ++ zonefree = zone_page_state(zone, NR_FREE_PAGES); ++ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + ++ zone_page_state(zone, NR_INACTIVE_FILE); ++ ++ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; + } +@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + * + * anon in [0], file in [1] + */ ++ ++ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + ++ get_lru_size(lruvec, LRU_INACTIVE_ANON); ++ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + ++ get_lru_size(lruvec, LRU_INACTIVE_FILE); ++ + spin_lock_irq(&zone->lru_lock); + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { + reclaim_stat->recent_scanned[0] /= 2; +@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; +- bool scan_adjusted = false; ++ bool scan_adjusted; + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + ++ /* ++ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal ++ * event that can occur when there is little memory pressure e.g. ++ * multiple streaming readers/writers. Hence, we do not abort scanning ++ * when the requested number of pages are reclaimed when scanning at ++ * DEF_PRIORITY on the assumption that the fact we are direct ++ * reclaiming implies that kswapd is not keeping up and it is best to ++ * do a batch of work at once. For memcg reclaim one check is made to ++ * abort proportional reclaim if either the file or anon lru has already ++ * dropped to zero at the first pass. ++ */ ++ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && ++ sc->priority == DEF_PRIORITY); ++ + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { +@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + continue; + + /* +- * For global direct reclaim, reclaim only the number of pages +- * requested. Less care is taken to scan proportionally as it +- * is more important to minimise direct reclaim stall latency +- * than it is to properly age the LRU lists. +- */ +- if (global_reclaim(sc) && !current_is_kswapd()) +- break; +- +- /* + * For kswapd and memcg, reclaim at least the number of pages +- * requested. Ensure that the anon and file LRUs shrink ++ * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. 
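As an aside on the shrink_slab_node() hunks above: they change the batch loop so that a slab cache holding fewer than batch_size freeable objects can still be scanned when pressure is high enough that total_scan reaches freeable. A minimal userspace sketch of that loop rule, with a hypothetical demo_scan() stand-in for shrinker->scan_objects() and made-up object counts, using the same min(batch_size, total_scan) step the patch introduces:

	#include <stdio.h>

	#define SHRINK_BATCH 128UL

	/* Hypothetical stand-in for ->scan_objects(): frees whatever it is asked to. */
	static unsigned long demo_scan(unsigned long nr_to_scan)
	{
		return nr_to_scan;
	}

	int main(void)
	{
		unsigned long freeable   = 50;   /* small cache, below the batch size    */
		unsigned long total_scan = 50;   /* what memory pressure asks us to scan */
		unsigned long batch_size = SHRINK_BATCH;
		unsigned long freed      = 0;

		/*
		 * Old rule: while (total_scan >= batch_size) -- never entered here,
		 * so the 50 freeable objects would never be reclaimed.
		 * New rule from the patch: also enter when total_scan >= freeable.
		 */
		while (total_scan >= batch_size || total_scan >= freeable) {
			unsigned long nr_to_scan =
				total_scan < batch_size ? total_scan : batch_size;

			freed += demo_scan(nr_to_scan);
			total_scan -= nr_to_scan;
		}

		printf("freed %lu objects\n", freed);
		return 0;
	}

Built with any C compiler, this prints "freed 50 objects"; under the old test alone the loop body never runs for such a small cache.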
+@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + ++ /* ++ * It's just vindictive to attack the larger once the smaller ++ * has gone to zero. And given the way we stop scanning the ++ * smaller below, this makes sure that we only make one nudge ++ * towards proportionality once we've got nr_to_reclaim. ++ */ ++ if (!nr_file || !nr_anon) ++ break; ++ + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; +@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + unsigned long lru_pages = 0; + + nodes_clear(shrink->nodes_to_scan); +- for_each_zone_zonelist(zone, z, zonelist, +- gfp_zone(sc->gfp_mask)) { ++ for_each_zone_zonelist_nodemask(zone, z, zonelist, ++ gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 5a442a723d79..f7ca04482299 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, + continue; + + threshold = (*calculate_pressure)(zone); +- for_each_possible_cpu(cpu) ++ for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +@@ -761,6 +761,7 @@ const char * const vmstat_text[] = { + "nr_shmem", + "nr_dirtied", + "nr_written", ++ "nr_pages_scanned", + + #ifdef CONFIG_NUMA + "numa_hit", +@@ -851,12 +852,14 @@ const char * const vmstat_text[] = { + "thp_zero_page_alloc", + "thp_zero_page_alloc_failed", + #endif ++#ifdef CONFIG_DEBUG_TLBFLUSH + #ifdef CONFIG_SMP + "nr_tlb_remote_flush", + "nr_tlb_remote_flush_received", +-#endif ++#endif /* CONFIG_SMP */ + "nr_tlb_local_flush_all", + "nr_tlb_local_flush_one", ++#endif /* CONFIG_DEBUG_TLBFLUSH */ + + #endif /* CONFIG_VM_EVENTS_COUNTERS */ + }; +@@ -1053,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), +- zone->pages_scanned, ++ zone_page_state(zone, NR_PAGES_SCANNED), + zone->spanned_pages, + zone->present_pages, + zone->managed_pages); +@@ -1063,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, + zone_page_state(zone, i)); + + seq_printf(m, +- "\n protection: (%lu", ++ "\n protection: (%ld", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) +- seq_printf(m, ", %lu", zone->lowmem_reserve[i]); ++ seq_printf(m, ", %ld", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); |
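One small illustration of the final zoneinfo hunk above, which switches the lowmem_reserve[] output from %lu to %ld, presumably because the field is treated as a signed long elsewhere in this series. The value below is made up; the point is only what the wrong conversion specifier does to a negative entry:

	#include <stdio.h>

	int main(void)
	{
		long lowmem_reserve = -1;	/* made-up negative entry */

		printf("with %%lu: %lu\n", (unsigned long)lowmem_reserve);
		printf("with %%ld: %ld\n", lowmem_reserve);
		return 0;
	}

On a 64-bit build the first line prints 18446744073709551615, which is the kind of nonsense the %ld form avoids.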