Remove special cased hugetlb handling code within the page cache by
changing the granularity of ->index to the base page size rather than the
huge page size. The motivation of this patch is to reduce complexity
within the filemap code while also increasing performance by removing
branches that are evaluated on every page cache lookup.
To support the change in index, new wrappers for hugetlb page cache
interactions are added. These wrappers perform the conversion to a linear
index which is now expected by the page cache for huge pages.
========================= PERFORMANCE ======================================
Perf was used to check the performance differences after the patch.
Overall the performance is similar to mainline with a very small larger
overhead that occurs in __filemap_add_folio() and
hugetlb_add_to_page_cache(). This is because of the larger overhead that
occurs in xa_load() and xa_store() as the xarray is now using more entries
to store hugetlb folios in the page cache.
Timing
aarch64
2MB Page Size
6.5-rc3 + this patch:
[root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
real 1m49.568s
user 0m0.000s
sys 1m49.461s
6.5-rc3:
[root]# time fallocate -l 700GB test.txt
real 1m47.495s
user 0m0.000s
sys 1m47.370s
1GB Page Size
6.5-rc3 + this patch:
[root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
real 1m47.024s
user 0m0.000s
sys 1m46.921s
6.5-rc3:
[root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
real 1m44.551s
user 0m0.000s
sys 1m44.438s
x86
2MB Page Size
6.5-rc3 + this patch:
[root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
real 0m22.383s
user 0m0.000s
sys 0m22.255s
6.5-rc3:
[opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
real 0m22.735s
user 0m0.038s
sys 0m22.567s
1GB Page Size
6.5-rc3 + this patch:
[root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
real 0m25.786s
user 0m0.001s
sys 0m25.589s
6.5-rc3:
[root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
real 0m33.454s
user 0m0.001s
sys 0m33.193s
aarch64:
workload - fallocate a 700GB file backed by huge pages
6.5-rc3 + this patch:
2MB Page Size:
--100.00%--__arm64_sys_fallocate
ksys_fallocate
vfs_fallocate
hugetlbfs_fallocate
|
|--95.04%--__pi_clear_page
|
|--3.57%--clear_huge_page
| |
| |--2.63%--rcu_all_qs
| |
| --0.91%--__cond_resched
|
--0.67%--__cond_resched
0.17% 0.00% 0 fallocate [kernel.vmlinux] [k] hugetlb_add_to_page_cache
0.14% 0.10% 11 fallocate [kernel.vmlinux] [k] __filemap_add_folio
6.5-rc3
2MB Page Size:
--100.00%--__arm64_sys_fallocate
ksys_fallocate
vfs_fallocate
hugetlbfs_fallocate
|
|--94.91%--__pi_clear_page
|
|--4.11%--clear_huge_page
| |
| |--3.00%--rcu_all_qs
| |
| --1.10%--__cond_resched
|
--0.59%--__cond_resched
0.08% 0.01% 1 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache
0.05% 0.03% 3 fallocate [kernel.kallsyms] [k] __filemap_add_folio
x86
workload - fallocate a 100GB file backed by huge pages
6.5-rc3 + this patch:
2MB Page Size:
hugetlbfs_fallocate
|
--99.57%--clear_huge_page
|
--98.47%--clear_page_erms
|
--0.53%--asm_sysvec_apic_timer_interrupt
0.04% 0.04% 1 fallocate [kernel.kallsyms] [k] xa_load
0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache
0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] __filemap_add_folio
0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] xas_store
6.5-rc3
2MB Page Size:
--99.93%--__x64_sys_fallocate
vfs_fallocate
hugetlbfs_fallocate
|
--99.38%--clear_huge_page
|
|--98.40%--clear_page_erms
|
--0.59%--__cond_resched
0.03% 0.03% 1 fallocate [kernel.kallsyms] [k] __filemap_add_folio
========================= TESTING ======================================
This patch passes libhugetlbfs tests and LTP hugetlb tests
********** TEST SUMMARY
* 2M
* 32-bit 64-bit
* Total testcases: 110 113
* Skipped: 0 0
* PASS: 107 113
* FAIL: 0 0
* Killed by signal: 3 0
* Bad configuration: 0 0
* Expected FAIL: 0 0
* Unexpected PASS: 0 0
* Test not present: 0 0
* Strange test result: 0 0
**********
Done executing testcases.
LTP Version:
20220527-178-g2761a81c4
page migration was also tested using Mike Kravetz's test program.[8]
[dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug]
Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain
Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
ssize_t retval = 0;
while (iov_iter_count(to)) {
- struct page *page;
+ struct folio *folio;
size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
}
nr = nr - offset;
- /* Find the page */
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ /* Find the folio */
+ folio = filemap_lock_hugetlb_folio(h, mapping, index);
+ if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
- unlock_page(page);
+ folio_unlock(folio);
- if (!PageHWPoison(page))
+ if (!folio_test_has_hwpoisoned(folio))
want = nr;
else {
/*
* touching the 1st raw HWPOISON subpage after
* offset.
*/
- want = adjust_range_hwpoison(page, offset, nr);
+ want = adjust_range_hwpoison(&folio->page, offset, nr);
if (want == 0) {
- put_page(page);
+ folio_put(folio);
retval = -EIO;
break;
}
}
/*
- * We have the page, copy it to user space buffer.
+ * We have the folio, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, want, to);
- put_page(page);
+ copied = copy_folio_to_iter(folio, offset, want, to);
+ folio_put(folio);
}
offset += copied;
retval += copied;
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
+ const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = folio->index;
+ index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
}
if (truncate_op)
- (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+ (void)hugetlb_unreserve_pages(inode,
+ lstart >> huge_page_shift(h),
+ LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
pgoff_t idx = start >> huge_page_shift(h);
struct folio *folio;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
return;
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
- folio = filemap_get_folio(mapping, index);
+ folio = filemap_get_folio(mapping, index << huge_page_order(h));
if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return huge_page_size(h) / 512;
}
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+ struct address_space *mapping, pgoff_t idx)
+{
+ return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
#include <asm/hugetlb.h>
#ifndef is_hugepage_only_range
return NULL;
}
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+ struct address_space *mapping, pgoff_t idx)
+{
+ return NULL;
+}
+
static inline int isolate_or_dissolve_huge_page(struct page *page,
struct list_head *list)
{
*/
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
- /* HugeTLBfs indexes the page cache in units of hpage_size */
- if (folio_test_hugetlb(folio))
- return &folio->page;
return folio_page(folio, index & (folio_nr_pages(folio) - 1));
}
*/
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
- /* HugeTLBfs indexes the page cache in units of hpage_size */
- if (folio_test_hugetlb(folio))
- return folio->index == index;
return index - folio_index(folio) < folio_nr_pages(folio);
}
}
/*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
*/
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
{
struct page *head;
return head->index + page - head;
}
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
- if (unlikely(PageHuge(page)))
- return hugetlb_basepage_index(page);
- return page_to_index(page);
-}
-
/*
* Return byte-offset into filesystem object for page.
*/
/*
* Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
*/
static inline pgoff_t folio_pgoff(struct folio *folio)
{
- if (unlikely(folio_test_hugetlb(folio)))
- return hugetlb_basepage_index(&folio->page);
return folio->index;
}
-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address);
-
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
pgoff_t pgoff;
- if (unlikely(is_vm_hugetlb_page(vma)))
- return linear_hugepage_index(vma, address);
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff;
mapping_set_update(&xas, mapping);
- /* hugetlb pages are represented by a single entry in the xarray */
- if (!folio_test_hugetlb(folio)) {
- xas_set_order(&xas, folio->index, folio_order(folio));
- nr = folio_nr_pages(folio);
- }
+ xas_set_order(&xas, folio->index, folio_order(folio));
+ nr = folio_nr_pages(folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (free_folio)
free_folio(folio);
- if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ if (folio_test_large(folio))
refs = folio_nr_pages(folio);
folio_put_refs(folio, refs);
}
if (!huge) {
int error = mem_cgroup_charge(folio, NULL, gfp);
- VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
return error;
charged = true;
- xas_set_order(&xas, index, folio_order(folio));
- nr = folio_nr_pages(folio);
}
+ VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
+ xas_set_order(&xas, index, folio_order(folio));
+ nr = folio_nr_pages(folio);
+
gfp &= GFP_RECLAIM_MASK;
folio_ref_add(folio, nr);
folio->mapping = mapping;
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
- if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
- if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
continue;
if (!folio_batch_add(fbatch, folio)) {
unsigned long nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
*start = folio->index + nr;
goto out;
}
if (!folio_batch_add(fbatch, folio)) {
nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
*start = folio->index + nr;
goto out;
}
if (nr) {
folio = fbatch->folios[nr - 1];
- if (folio_test_hugetlb(folio))
- *start = folio->index + 1;
- else
- *start = folio_next_index(folio);
+ *start = folio->index + folio_nr_pages(folio);
}
out:
rcu_read_unlock();
continue;
if (!folio_batch_add(fbatch, folio)) {
unsigned long nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
*start = folio->index + nr;
goto out;
}
/*
* Convert the address within this vma to the page offset within
- * the mapping, in pagecache page units; huge pages here.
+ * the mapping, huge page units here.
*/
static pgoff_t vma_hugecache_offset(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
(vma->vm_pgoff >> huge_page_order(h));
}
-pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address)
-{
- return vma_hugecache_offset(hstate_vma(vma), vma, address);
-}
-EXPORT_SYMBOL_GPL(linear_hugepage_index);
-
/**
* vma_kernel_pagesize - Page size granularity for this VMA.
* @vma: The user mapping.
return NULL;
}
-pgoff_t hugetlb_basepage_index(struct page *page)
-{
- struct page *page_head = compound_head(page);
- pgoff_t index = page_index(page_head);
- unsigned long compound_idx;
-
- if (compound_order(page_head) > MAX_ORDER)
- compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
- else
- compound_idx = page - page_head;
-
- return (index << compound_order(page_head)) + compound_idx;
-}
-
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ pgoff_t idx = linear_page_index(vma, address);
struct folio *folio;
folio = filemap_get_folio(mapping, idx);
struct hstate *h = hstate_inode(inode);
int err;
+ idx <<= huge_page_order(h);
__folio_set_locked(folio);
err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
* before we get page_table_lock.
*/
new_folio = false;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_folio = filemap_lock_folio(mapping, idx);
+ pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(pagecache_folio))
pagecache_folio = NULL;
}
if (is_continue) {
ret = -EFAULT;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
goto out;
folio_in_pagecache = true;
int expected_count;
xas_lock_irq(&xas);
- expected_count = 2 + folio_has_private(src);
+ expected_count = folio_expected_refs(mapping, src);
if (!folio_ref_freeze(src, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
dst->index = src->index;
dst->mapping = src->mapping;
- folio_get(dst);
+ folio_ref_add(dst, folio_nr_pages(dst));
xas_store(&xas, dst);
- folio_ref_unfreeze(src, expected_count - 1);
+ folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
xas_unlock_irq(&xas);