From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Thu, 13 Apr 2023 14:43:26 -0700
Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable()

Android 14 and later default to MGLRU [1] and field telemetry showed
occasional long tail latency (>100ms) in the reclaim path.

Tracing revealed priority inversion in the reclaim path.  In
try_to_inc_max_seq(), when high priority tasks were blocked on
wait_event_killable(), the preemption of the low priority task to call
wake_up_all() caused those high priority tasks to wait longer than
necessary.  In general, this problem is not different from others of its
kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU
because it introduced the new wait queue lruvec->mm_state.wait.
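
To make the inversion concrete, here is a minimal userspace sketch
(illustration only, not part of the patch; the thread names and the
usleep() standing in for preemption are made up).  The waiter's latency
is bounded by when the waking thread next runs: the condition is already
true, but the sleeper cannot notice until the broadcast arrives.

  /* Illustration only: userspace analogue of the wait/wake pattern. */
  #include <pthread.h>
  #include <stdio.h>
  #include <unistd.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
  static unsigned long   max_seq;  /* stands in for lrugen->max_seq */

  static void *reclaimer(void *arg)  /* the "high priority" waiter */
  {
      pthread_mutex_lock(&lock);
      while (max_seq == 0)             /* ~ wait_event_killable() */
          pthread_cond_wait(&done, &lock);
      pthread_mutex_unlock(&lock);
      return arg;
  }

  static void *walker(void *arg)  /* the "low priority" waker */
  {
      pthread_mutex_lock(&lock);
      max_seq++;                       /* the work is finished here... */
      pthread_mutex_unlock(&lock);
      usleep(100 * 1000);              /* ...but until the waker runs, */
      pthread_cond_broadcast(&done);   /* every waiter keeps sleeping */
      return arg;
  }

  int main(void)
  {
      pthread_t r, w;

      pthread_create(&r, NULL, reclaimer, NULL);
      pthread_create(&w, NULL, walker, NULL);
      pthread_join(r, NULL);
      pthread_join(w, NULL);
      puts("waiter released ~100ms after the work was done");
      return 0;
  }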

The purpose of this new wait queue is to avoid the thundering herd
problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only
one can succeed, i.e., the one that wakes up the rest, and those that
failed might cause premature OOM kills if they do not wait.  So far
there is no evidence supporting this scenario, based on how often the
wait has been hit.  This raises the question of how useful the wait
queue is in practice.
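
The race the queue guarded can be sketched with C11 atomics (again an
illustration only, not the kernel's actual primitives or names): every
racing caller snapshots the sequence, and exactly one can advance it;
with the queue gone, the losers simply return instead of sleeping.

  /* Illustration only: the "only one succeeds" race. */
  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static atomic_ulong max_seq;  /* stands in for lrugen->max_seq */

  static bool try_to_advance(unsigned long seq)  /* hypothetical name */
  {
      if (atomic_load(&max_seq) != seq)
          return false;  /* stale snapshot: someone else already aged */

      /* ... the page table walk would happen here ... */

      /* exactly one racing caller wins this CAS for a given seq */
      return atomic_compare_exchange_strong(&max_seq, &seq, seq + 1);
  }

  int main(void)
  {
      printf("first caller: %d\n", try_to_advance(0));  /* 1: wins */
      printf("stale caller: %d\n", try_to_advance(0));  /* 0: returns */
      return 0;
  }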

Based on Minchan's recommendation, which is in line with his commit
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and with
the rest of the MGLRU code, which also uses trylock when possible, remove
the wait queue.
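
That trylock style can be sketched in userspace as follows (the lock
and helper names are made up): on contention the caller neither sleeps
nor spins; it reports failure and reclaims elsewhere, which is how this
path behaves once the wait queue is gone.

  /* Illustration only: "trylock or move on", the style adopted here. */
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t rmap_lock = PTHREAD_MUTEX_INITIALIZER;

  static bool try_reclaim_one(void)  /* hypothetical helper */
  {
      if (pthread_mutex_trylock(&rmap_lock) != 0)
          return false;  /* contended: skip it, never block */

      /* ... reclaim work would happen under the lock ... */

      pthread_mutex_unlock(&rmap_lock);
      return true;
  }

  int main(void)
  {
      printf("reclaimed: %s\n", try_reclaim_one() ? "yes" : "skipped");
      return 0;
  }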

[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf

Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Reported-by: Wei Wang <wvw@google.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   8 +--
 mm/vmscan.c            | 111 +++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 77 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -452,18 +452,14 @@ enum {
 struct lru_gen_mm_state {
 	/* set to max_seq after each iteration */
 	unsigned long seq;
-	/* where the current iteration continues (inclusive) */
+	/* where the current iteration continues after */
 	struct list_head *head;
-	/* where the last iteration ended (exclusive) */
+	/* where the last iteration ended before */
 	struct list_head *tail;
-	/* to wait for the last page table walker to finish */
-	struct wait_queue_head wait;
 	/* Bloom filters flip after each iteration */
 	unsigned long *filters[NR_BLOOM_FILTERS];
 	/* the mm stats for debugging */
 	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
-	/* the number of concurrent page table walkers */
-	int nr_walkers;
 };
 
 struct lru_gen_mm_walk {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm
 		if (!lruvec)
 			continue;
 
-		/* where the last iteration ended (exclusive) */
+		/* where the current iteration continues after */
+		if (lruvec->mm_state.head == &mm->lru_gen.list)
+			lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+		/* where the last iteration ended before */
 		if (lruvec->mm_state.tail == &mm->lru_gen.list)
 			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
-
-		/* where the current iteration continues (inclusive) */
-		if (lruvec->mm_state.head != &mm->lru_gen.list)
-			continue;
-
-		lruvec->mm_state.head = lruvec->mm_state.head->next;
-		/* the deletion ends the current iteration */
-		if (lruvec->mm_state.head == &mm_list->fifo)
-			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
 	}
 
 	list_del_init(&mm->lru_gen.list);
@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve
 			    struct mm_struct **iter)
 {
 	bool first = false;
-	bool last = true;
+	bool last = false;
 	struct mm_struct *mm = NULL;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 
 	/*
-	 * There are four interesting cases for this page table walker:
-	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
-	 *    there is nothing left to do.
-	 * 2. It's the first of the current generation, and it needs to reset
-	 *    the Bloom filter for the next generation.
-	 * 3. It reaches the end of mm_list, and it needs to increment
-	 *    mm_state->seq; the iteration is done.
-	 * 4. It's the last of the current generation, and it needs to reset the
-	 *    mm stats counters for the next generation.
+	 * mm_state->seq is incremented after each iteration of mm_list. There
+	 * are three interesting cases for this page table walker:
+	 * 1. It tries to start a new iteration with a stale max_seq: there is
+	 *    nothing left to do.
+	 * 2. It started the next iteration: it needs to reset the Bloom filter
+	 *    so that a fresh set of PTE tables can be recorded.
+	 * 3. It ended the current iteration: it needs to reset the mm stats
+	 *    counters and tell its caller to increment max_seq.
 	 */
 	spin_lock(&mm_list->lock);
 
 	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
-	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
-	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 
-	if (walk->max_seq <= mm_state->seq) {
-		if (!*iter)
-			last = false;
+	if (walk->max_seq <= mm_state->seq)
 		goto done;
-	}
 
-	if (!mm_state->nr_walkers) {
-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+	if (!mm_state->head)
+		mm_state->head = &mm_list->fifo;
 
-		mm_state->head = mm_list->fifo.next;
+	if (mm_state->head == &mm_list->fifo)
 		first = true;
-	}
-
-	while (!mm && mm_state->head != &mm_list->fifo) {
-		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 
+	do {
 		mm_state->head = mm_state->head->next;
+		if (mm_state->head == &mm_list->fifo) {
+			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+			last = true;
+			break;
+		}
 
 		/* force scan for those added after the last iteration */
-		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
-			mm_state->tail = mm_state->head;
+		if (!mm_state->tail || mm_state->tail == mm_state->head) {
+			mm_state->tail = mm_state->head->next;
 			walk->force_scan = true;
 		}
 
+		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 		if (should_skip_mm(mm, walk))
 			mm = NULL;
-	}
-
-	if (mm_state->head == &mm_list->fifo)
-		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+	} while (!mm);
 done:
-	if (*iter && !mm)
-		mm_state->nr_walkers--;
-	if (!*iter && mm)
-		mm_state->nr_walkers++;
-
-	if (mm_state->nr_walkers)
-		last = false;
-
 	if (*iter || last)
 		reset_mm_stats(lruvec, walk, last);
 
@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc
 
 	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 
-	if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
-
+	if (max_seq > mm_state->seq) {
+		mm_state->head = NULL;
+		mm_state->tail = NULL;
 		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 		reset_mm_stats(lruvec, NULL, true);
 		success = true;
@@ -3894,10 +3875,6 @@ restart:
 
 		walk_pmd_range(&val, addr, next, args);
 
-		/* a racy check to curtail the waiting time */
-		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
-			return 1;
-
 		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
 			end = (addr | ~PUD_MASK) + 1;
 			goto done;
@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve
 	walk->next_addr = FIRST_USER_ADDRESS;
 
 	do {
+		DEFINE_MAX_SEQ(lruvec);
+
 		err = -EBUSY;
 
+		/* another thread might have called inc_max_seq() */
+		if (walk->max_seq != max_seq)
+			break;
+
 		/* page_update_gen() requires stable page_memcg() */
 		if (!mem_cgroup_trylock_pages(memcg))
 			break;
@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr
 		success = iterate_mm_list(lruvec, walk, &mm);
 		if (mm)
 			walk_mm(lruvec, mm, walk);
-
-		cond_resched();
 	} while (mm);
 done:
-	if (!success) {
-		if (sc->priority <= DEF_PRIORITY - 2)
-			wait_event_killable(lruvec->mm_state.wait,
-					    max_seq < READ_ONCE(lrugen->max_seq));
-		return false;
-	}
+	if (success)
+		inc_max_seq(lruvec, can_swap, force_scan);
 
-	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
-
-	inc_max_seq(lruvec, can_swap, force_scan);
-	/* either this sees any waiters or they will see updated max_seq */
-	if (wq_has_sleeper(&lruvec->mm_state.wait))
-		wake_up_all(&lruvec->mm_state.wait);
-
-	return true;
+	return success;
 }
 
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec *
 		INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
 
 	lruvec->mm_state.seq = MIN_NR_GENS;
-	init_waitqueue_head(&lruvec->mm_state.wait);
 }
 
 #ifdef CONFIG_MEMCG