mirror of
				git://git.openwrt.org/openwrt/openwrt.git
				synced 2025-10-31 14:04:26 -04:00 
			
		
		
		
	Refresh kernel patches for generic kernel 5.15 due to a new backport version of the MGLRU patchset. Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
		
			
				
	
	
		
			316 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			316 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
| From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
 | |
| From: Yu Zhao <yuzhao@google.com>
 | |
| Date: Sun, 18 Sep 2022 02:00:06 -0600
 | |
| Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
 | |
| MIME-Version: 1.0
 | |
| Content-Type: text/plain; charset=UTF-8
 | |
| Content-Transfer-Encoding: 8bit
 | |
| 
 | |
| When multiple memcgs are available, it is possible to use generations as a
 | |
| frame of reference to make better choices and improve overall performance
 | |
| under global memory pressure.  This patch adds a basic optimization to
 | |
| select memcgs that can drop single-use unmapped clean pages first.  Doing
 | |
| so reduces the chance of going into the aging path or swapping, which can
 | |
| be costly.
 | |
| 
 | |
| A typical example that benefits from this optimization is a server running
 | |
| mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
 | |
| buffered I/O workload in the other.
 | |
| 
 | |
| Though this optimization can be applied to both kswapd and direct reclaim,
 | |
| it is only added to kswapd to keep the patchset manageable.  Later
 | |
| improvements may cover the direct reclaim path.
 | |
| 
 | |
| While ensuring certain fairness to all eligible memcgs, proportional scans
 | |
| of individual memcgs also require proper backoff to avoid overshooting
 | |
| their aggregate reclaim target by too much.  Otherwise it can cause high
 | |
| direct reclaim latency.  The conditions for backoff are:
 | |
| 
 | |
| 1. At low priorities, for direct reclaim, if aging fairness or direct
 | |
|    reclaim latency is at risk, i.e., aging one memcg multiple times or
 | |
|    swapping after the target is met.
 | |
| 2. At high priorities, for global reclaim, if per-zone free pages are
 | |
|    above respective watermarks.
 | |
| 
 | |
| Server benchmark results:
 | |
|   Mixed workloads:
 | |
|     fio (buffered I/O): +[19, 21]%
 | |
|                 IOPS         BW
 | |
|       patch1-8: 1880k        7343MiB/s
 | |
|       patch1-9: 2252k        8796MiB/s
 | |
| 
 | |
|     memcached (anon): +[119, 123]%
 | |
|                 Ops/sec      KB/sec
 | |
|       patch1-8: 862768.65    33514.68
 | |
|       patch1-9: 1911022.12   74234.54
 | |
| 
 | |
|   Mixed workloads:
 | |
|     fio (buffered I/O): +[75, 77]%
 | |
|                 IOPS         BW
 | |
|       5.19-rc1: 1279k        4996MiB/s
 | |
|       patch1-9: 2252k        8796MiB/s
 | |
| 
 | |
|     memcached (anon): +[13, 15]%
 | |
|                 Ops/sec      KB/sec
 | |
|       5.19-rc1: 1673524.04   65008.87
 | |
|       patch1-9: 1911022.12   74234.54
 | |
| 
 | |
|   Configurations:
 | |
|     (changes since patch 6)
 | |
| 
 | |
|     cat mixed.sh
 | |
|     modprobe brd rd_nr=2 rd_size=56623104
 | |
| 
 | |
|     swapoff -a
 | |
|     mkswap /dev/ram0
 | |
|     swapon /dev/ram0
 | |
| 
 | |
|     mkfs.ext4 /dev/ram1
 | |
|     mount -t ext4 /dev/ram1 /mnt
 | |
| 
 | |
|     memtier_benchmark -S /var/run/memcached/memcached.sock \
 | |
|       -P memcache_binary -n allkeys --key-minimum=1 \
 | |
|       --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
 | |
|       --ratio 1:0 --pipeline 8 -d 2000
 | |
| 
 | |
|     fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
 | |
|       --buffered=1 --ioengine=io_uring --iodepth=128 \
 | |
|       --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
 | |
|       --rw=randread --random_distribution=random --norandommap \
 | |
|       --time_based --ramp_time=10m --runtime=90m --group_reporting &
 | |
|     pid=$!
 | |
| 
 | |
|     sleep 200
 | |
| 
 | |
|     memtier_benchmark -S /var/run/memcached/memcached.sock \
 | |
|       -P memcache_binary -n allkeys --key-minimum=1 \
 | |
|       --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
 | |
|       --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
 | |
| 
 | |
|     kill -INT $pid
 | |
|     wait
 | |
| 
 | |
| Client benchmark results:
 | |
|   no change (CONFIG_MEMCG=n)
 | |
| 
 | |
| Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
 | |
| Signed-off-by: Yu Zhao <yuzhao@google.com>
 | |
| Acked-by: Brian Geffon <bgeffon@google.com>
 | |
| Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 | |
| Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 | |
| Acked-by: Steven Barrett <steven@liquorix.net>
 | |
| Acked-by: Suleiman Souhlal <suleiman@google.com>
 | |
| Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 | |
| Tested-by: Donald Carr <d@chaos-reins.com>
 | |
| Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 | |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 | |
| Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 | |
| Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 | |
| Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 | |
| Cc: Andi Kleen <ak@linux.intel.com>
 | |
| Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
 | |
| Cc: Barry Song <baohua@kernel.org>
 | |
| Cc: Catalin Marinas <catalin.marinas@arm.com>
 | |
| Cc: Dave Hansen <dave.hansen@linux.intel.com>
 | |
| Cc: Hillf Danton <hdanton@sina.com>
 | |
| Cc: Jens Axboe <axboe@kernel.dk>
 | |
| Cc: Johannes Weiner <hannes@cmpxchg.org>
 | |
| Cc: Jonathan Corbet <corbet@lwn.net>
 | |
| Cc: Linus Torvalds <torvalds@linux-foundation.org>
 | |
| Cc: Matthew Wilcox <willy@infradead.org>
 | |
| Cc: Mel Gorman <mgorman@suse.de>
 | |
| Cc: Miaohe Lin <linmiaohe@huawei.com>
 | |
| Cc: Michael Larabel <Michael@MichaelLarabel.com>
 | |
| Cc: Michal Hocko <mhocko@kernel.org>
 | |
| Cc: Mike Rapoport <rppt@kernel.org>
 | |
| Cc: Mike Rapoport <rppt@linux.ibm.com>
 | |
| Cc: Peter Zijlstra <peterz@infradead.org>
 | |
| Cc: Qi Zheng <zhengqi.arch@bytedance.com>
 | |
| Cc: Tejun Heo <tj@kernel.org>
 | |
| Cc: Vlastimil Babka <vbabka@suse.cz>
 | |
| Cc: Will Deacon <will@kernel.org>
 | |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 | |
| ---
 | |
|  mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
 | |
|  1 file changed, 96 insertions(+), 9 deletions(-)
 | |
| 
 | |
| --- a/mm/vmscan.c
 | |
| +++ b/mm/vmscan.c
 | |
| @@ -127,6 +127,12 @@ struct scan_control {
 | |
|  	/* Always discard instead of demoting to lower tier memory */
 | |
|  	unsigned int no_demotion:1;
 | |
|  
 | |
| +#ifdef CONFIG_LRU_GEN
 | |
| +	/* help kswapd make better choices among multiple memcgs */
 | |
| +	unsigned int memcgs_need_aging:1;
 | |
| +	unsigned long last_reclaimed;
 | |
| +#endif
 | |
| +
 | |
|  	/* Allocation order */
 | |
|  	s8 order;
 | |
|  
 | |
| @@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
 | |
|  
 | |
|  	VM_WARN_ON_ONCE(!current_is_kswapd());
 | |
|  
 | |
| +	sc->last_reclaimed = sc->nr_reclaimed;
 | |
| +
 | |
| +	/*
 | |
| +	 * To reduce the chance of going into the aging path, which can be
 | |
| +	 * costly, optimistically skip it if the flag below was cleared in the
 | |
| +	 * eviction path. This improves the overall performance when multiple
 | |
| +	 * memcgs are available.
 | |
| +	 */
 | |
| +	if (!sc->memcgs_need_aging) {
 | |
| +		sc->memcgs_need_aging = true;
 | |
| +		return;
 | |
| +	}
 | |
| +
 | |
|  	set_mm_walk(pgdat);
 | |
|  
 | |
|  	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 | |
| @@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
 | |
|  	return scanned;
 | |
|  }
 | |
|  
 | |
| -static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 | |
| +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
 | |
| +		       bool *need_swapping)
 | |
|  {
 | |
|  	int type;
 | |
|  	int scanned;
 | |
| @@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
 | |
|  
 | |
|  	sc->nr_reclaimed += reclaimed;
 | |
|  
 | |
| +	if (need_swapping && type == LRU_GEN_ANON)
 | |
| +		*need_swapping = true;
 | |
| +
 | |
|  	return scanned;
 | |
|  }
 | |
|  
 | |
| @@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
 | |
|   *    reclaim.
 | |
|   */
 | |
|  static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
 | |
| -				    bool can_swap)
 | |
| +				    bool can_swap, bool *need_aging)
 | |
|  {
 | |
| -	bool need_aging;
 | |
|  	unsigned long nr_to_scan;
 | |
|  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 | |
|  	DEFINE_MAX_SEQ(lruvec);
 | |
| @@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
 | |
|  	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
 | |
|  		return 0;
 | |
|  
 | |
| -	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
 | |
| -	if (!need_aging)
 | |
| +	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
 | |
| +	if (!*need_aging)
 | |
|  		return nr_to_scan;
 | |
|  
 | |
|  	/* skip the aging path at the default priority */
 | |
| @@ -4715,10 +4737,68 @@ done:
 | |
|  	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 | |
|  }
 | |
|  
 | |
| +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
 | |
| +			      struct scan_control *sc, bool need_swapping)
 | |
| +{
 | |
| +	int i;
 | |
| +	DEFINE_MAX_SEQ(lruvec);
 | |
| +
 | |
| +	if (!current_is_kswapd()) {
 | |
| +		/* age each memcg once to ensure fairness */
 | |
| +		if (max_seq - seq > 1)
 | |
| +			return true;
 | |
| +
 | |
| +		/* over-swapping can increase allocation latency */
 | |
| +		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
 | |
| +			return true;
 | |
| +
 | |
| +		/* give this thread a chance to exit and free its memory */
 | |
| +		if (fatal_signal_pending(current)) {
 | |
| +			sc->nr_reclaimed += MIN_LRU_BATCH;
 | |
| +			return true;
 | |
| +		}
 | |
| +
 | |
| +		if (cgroup_reclaim(sc))
 | |
| +			return false;
 | |
| +	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
 | |
| +		return false;
 | |
| +
 | |
| +	/* keep scanning at low priorities to ensure fairness */
 | |
| +	if (sc->priority > DEF_PRIORITY - 2)
 | |
| +		return false;
 | |
| +
 | |
| +	/*
 | |
| +	 * A minimum amount of work was done under global memory pressure. For
 | |
| +	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
 | |
| +	 * met, and yet the allocation may still succeed, since kswapd may have
 | |
| +	 * caught up. In either case, it's better to stop now, and restart if
 | |
| +	 * necessary.
 | |
| +	 */
 | |
| +	for (i = 0; i <= sc->reclaim_idx; i++) {
 | |
| +		unsigned long wmark;
 | |
| +		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
 | |
| +
 | |
| +		if (!managed_zone(zone))
 | |
| +			continue;
 | |
| +
 | |
| +		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
 | |
| +		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
 | |
| +			return false;
 | |
| +	}
 | |
| +
 | |
| +	sc->nr_reclaimed += MIN_LRU_BATCH;
 | |
| +
 | |
| +	return true;
 | |
| +}
 | |
| +
 | |
|  static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 | |
|  {
 | |
|  	struct blk_plug plug;
 | |
| +	bool need_aging = false;
 | |
| +	bool need_swapping = false;
 | |
|  	unsigned long scanned = 0;
 | |
| +	unsigned long reclaimed = sc->nr_reclaimed;
 | |
| +	DEFINE_MAX_SEQ(lruvec);
 | |
|  
 | |
|  	lru_add_drain();
 | |
|  
 | |
| @@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
 | |
|  		else
 | |
|  			swappiness = 0;
 | |
|  
 | |
| -		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
 | |
| +		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
 | |
|  		if (!nr_to_scan)
 | |
| -			break;
 | |
| +			goto done;
 | |
|  
 | |
| -		delta = evict_pages(lruvec, sc, swappiness);
 | |
| +		delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
 | |
|  		if (!delta)
 | |
| -			break;
 | |
| +			goto done;
 | |
|  
 | |
|  		scanned += delta;
 | |
|  		if (scanned >= nr_to_scan)
 | |
|  			break;
 | |
|  
 | |
| +		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
 | |
| +			break;
 | |
| +
 | |
|  		cond_resched();
 | |
|  	}
 | |
|  
 | |
| +	/* see the comment in lru_gen_age_node() */
 | |
| +	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
 | |
| +		sc->memcgs_need_aging = false;
 | |
| +done:
 | |
|  	clear_mm_walk();
 | |
|  
 | |
|  	blk_finish_plug(&plug);
 |