From 121755d4eb128a1294918ae8569d75b6005efd6d Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 8 May 2023 12:46:21 +0300
Subject: [PATCH 01/21] lockdep: Swap storage for pin_count and references

As a lockmap takes a reference for every ww_mutex used together, this
can be an arbitrarily large number and under control of userspace --
easily overflowing the arbitrary limit of 4096. However, the pin_count
(used for detecting unexpected lock dropping) is a full 32b despite
nesting being extremely rare (see lockdep_pin_lock).

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8028
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190425092004.9995-33-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
[Joonas: Converting to pin_count:11 as per addition of sync:1]
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 include/linux/lockdep_types.h |  4 ++--
 kernel/locking/lockdep.c      | 11 +++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 9f361d3ab9d95d..cb943f4b238569 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -252,8 +252,8 @@ struct held_lock {
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
 	unsigned int sync:1;
-	unsigned int references:11;					/* 32 bits */
-	unsigned int pin_count;
+	unsigned int pin_count:11;					/* 32 bits */
+	unsigned int references;
 };
 
 #else /* !CONFIG_LOCKDEP */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 58d78a33ac65bf..eeb8320e1a627d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5621,11 +5621,14 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
 
 		if (match_held_lock(hlock, lock)) {
 			/*
-			 * Grab 16bits of randomness; this is sufficient to not
-			 * be guessable and still allows some pin nesting in
-			 * our u32 pin_count.
+			 * Grab 6bits of randomness; this is barely sufficient
+			 * to not be guessable and still allows some 32 levels
+			 * of pin nesting in our u11 pin_count.
 			 */
-			cookie.val = 1 + (sched_clock() & 0xffff);
+			cookie.val = 1 + (sched_clock() & 0x3f);
+			if (DEBUG_LOCKS_WARN_ON(hlock->pin_count + cookie.val >= 1 << 11))
+				return NIL_COOKIE;
+
 			hlock->pin_count += cookie.val;
 			return cookie;
 		}

From 00b6ad8a555a535d046af97254dda8b5e6fe4c20 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 13 Nov 2017 12:57:06 +0000
Subject: [PATCH 02/21] ftrace: Allow configuring global trace buffer size (for
 dump-on-oops)

We have recently turned on ftrace-dump-on-oops for i915's CI and an
issue we have encountered is that the trace buffer size greatly exceeds
the pstore capabilities; we get the tail of the oops but not the
introduction.

Currently the global buffer size is controllable on the cmdline, but at
the request of our CI sysadmin, we would like to add a control to the
Kconfig as well. The rationale being the cmdline carries the temporary
hacks that we want to eradicate, and we want to track the permanent
configuration in .config.

I have kept the Kconfig option hidden from the user as the default
should suffice for the majority of users; reserving the configuration
for those that eschew the cmdline option.

v2: Add an expert prompt to stop the default value overriding .config
changes.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8029
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 kernel/trace/Kconfig | 7 +++++++
 kernel/trace/trace.c | 4 +---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a3f35c7d83b6c5..012db82731e296 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,13 @@ config TRACING
 	select TRACE_CLOCK
 	select NEED_TASKS_RCU
 
+config GLOBAL_TRACE_BUF_SIZE
+	int
+	prompt "Global ftrace buffer size (for trace_printk)" if EXPERT
+	range 0 4194304 # upper bound 4 MiB (2^22 bytes)
+	default 1441792 # 16384 * 88 (sizeof(struct print_entry))
+	depends on TRACING
+
 config GENERIC_TRACER
 	bool
 	select TRACING
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b581e388a9d9f7..30fc05335b9222 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -888,9 +888,7 @@ int tracing_is_enabled(void)
  * to not have to wait for all that output. Anyway this can be
  * boot time and run time configurable.
  */
-#define TRACE_BUF_SIZE_DEFAULT	1441792UL /* 16384 * 88 (sizeof(entry)) */
-
-static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
+static unsigned long		trace_buf_size = CONFIG_GLOBAL_TRACE_BUF_SIZE;
 
 /* trace_types holds a link list of available tracers. */
 static struct tracer		*trace_types __read_mostly;

From b387b7e2c4f39a9304d22ff3c38ae06e8880d0bb Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 3 Sep 2018 14:17:45 +0100
Subject: [PATCH 03/21] kernel/panic: Show the stacktrace after additional
 notifier messages

Most systems keep the last messages from the panic, and we value the
stacktrace most, so dump it last in order to preserve it for
post-mortems.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8030
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Martin Peres <martin.peres@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180903131745.30593-1-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 kernel/panic.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/kernel/panic.c b/kernel/panic.c
index a3889f38153d96..5b2bc2ac8f67ea 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -346,13 +346,6 @@ void panic(const char *fmt, ...)
 		buf[len - 1] = '\0';
 
 	pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-	/*
-	 * Avoid nested stack-dumping if a panic occurs during oops processing
-	 */
-	if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
-		dump_stack();
-#endif
 
 	/*
 	 * If kgdb is enabled, give it a chance to run before we stop all
@@ -384,6 +377,14 @@ void panic(const char *fmt, ...)
 
 	panic_print_sys_info(false);
 
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+	/*
+	 * Avoid nested stack-dumping if a panic occurs during oops processing
+	 */
+	if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+		dump_stack();
+#endif
+
 	kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
 
 	/*

From 43cf688985bc6b59eb999f9b5b60cc49073d7546 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 9 Oct 2018 12:35:21 +0100
Subject: [PATCH 04/21] x86: Downgrade clock throttling thermal event critical
 error

Under CI testing, it is common for the cpus to overheat with the
continuous workloads and end up being throttled. As the cpus still
function, it is less of a critical error meriting urgent action, but an
expected yet significant condition (pr_notice).

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8031
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/thermal/intel/therm_throt.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index e69868e868eb9e..9d18f2fba7a009 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -345,10 +345,10 @@ static void __maybe_unused throttle_active_work(struct work_struct *work)
 	avg /= ARRAY_SIZE(state->temp_samples);
 
 	if (state->average > avg) {
-		pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
-			this_cpu,
-			state->level == CORE_LEVEL ? "Core" : "Package",
-			state->count);
+		pr_notice("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
+			  this_cpu,
+			  state->level == CORE_LEVEL ? "Core" : "Package",
+			  state->count);
 		state->rate_control_active = true;
 	}
 

From 3f412047c54e28ecd50c10bdcec698f166c861e8 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 16 Nov 2021 09:22:48 +0100
Subject: [PATCH 05/21] libata: Downgrade unsupported feature warnings to
 notifications

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8032
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
[danvet: Rebase]
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/ata/libata-core.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 773799cfd44308..14bed90d833676 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2178,7 +2178,7 @@ static bool ata_identify_page_supported(struct ata_device *dev, u8 page)
 		 * for drives which implement this ATA level or above.
 		 */
 		if (ata_id_major_version(dev->id) >= 10)
-			ata_dev_warn(dev,
+			ata_dev_notice(dev,
 				"ATA Identify Device Log not supported\n");
 		dev->quirks |= ATA_QUIRK_NO_ID_DEV_LOG;
 		return false;
@@ -2249,7 +2249,7 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev)
 	unsigned int err_mask;
 
 	if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) {
-		ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n");
+		ata_dev_notice(dev, "NCQ Send/Recv Log not supported\n");
 		return;
 	}
 	err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV,
@@ -2273,8 +2273,8 @@ static void ata_dev_config_ncq_non_data(struct ata_device *dev)
 	unsigned int err_mask;
 
 	if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) {
-		ata_dev_warn(dev,
-			     "NCQ Non-Data Log not supported\n");
+		ata_dev_notice(dev,
+			       "NCQ Non-Data Log not supported\n");
 		return;
 	}
 	err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA,
@@ -2937,14 +2937,14 @@ int ata_dev_configure(struct ata_device *dev)
 		if (ata_id_is_cfa(id)) {
 			/* CPRM may make this media unusable */
 			if (id[ATA_ID_CFA_KEY_MGMT] & 1)
-				ata_dev_warn(dev,
+				ata_dev_notice(dev,
 	"supports DRM functions and may not be fully accessible\n");
 			snprintf(revbuf, 7, "CFA");
 		} else {
 			snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id));
 			/* Warn the user if the device has TPM extensions */
 			if (ata_id_has_tpm(id))
-				ata_dev_warn(dev,
+				ata_dev_notice(dev,
 	"supports DRM functions and may not be fully accessible\n");
 		}
 
@@ -3100,8 +3100,8 @@ int ata_dev_configure(struct ata_device *dev)
 	}
 
 	if ((dev->quirks & ATA_QUIRK_FIRMWARE_WARN) && print_info) {
-		ata_dev_warn(dev, "WARNING: device requires firmware update to be fully functional\n");
-		ata_dev_warn(dev, "         contact the vendor or visit http://ata.wiki.kernel.org\n");
+		ata_dev_notice(dev, "WARNING: device requires firmware update to be fully functional\n");
+		ata_dev_notice(dev, "         contact the vendor or visit http://ata.wiki.kernel.org\n");
 	}
 
 	return 0;

From 117031f2fdc406ebee33afc84102e65fea3501c1 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 2 May 2019 22:46:48 +0200
Subject: [PATCH 06/21] RFC: hung_task: taint kernel

There's the hung_task_panic sysctl, but that's a bit of an extreme measure.
As a fallback, at least taint the machine.

Our CI uses this to decide when a reboot is necessary, plus to figure
out whether the kernel is still happy.

v2: Works much better when I put the else { add_taint() } at the right
place.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8034
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.ibm.com>
Cc: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: "Liu, Chuansheng" <chuansheng.liu@intel.com>
Acked-by: Chris Wilson <chris@chris-wilson.co.uk> (for core-for-CI)
Link: https://patchwork.freedesktop.org/patch/msgid/20190502204648.5537-1-daniel.vetter@ffwll.ch
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 kernel/hung_task.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index dc898ec93463f6..d8c5e7bc8826e6 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -169,6 +169,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 		console_verbose();
 		hung_task_show_lock = true;
 		hung_task_call_panic = true;
+	} else {
+		add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 	}
 
 	/*

From 6d4322c03d37c088ffe1149f40a6cff93183a0a5 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 2 May 2019 21:42:08 +0200
Subject: [PATCH 07/21] RFC: soft/hardlockup: taint kernel

There are the soft/hardlockup_panic sysctls, but they're a bit of an
extreme measure. As a fallback, at least taint the machine.

Our CI uses this to decide when a reboot is necessary, plus to figure
out whether the kernel is still happy.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8035
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Vincent Whitchurch <vincent.whitchurch@axis.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: Sinan Kaya <okaya@kernel.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Chris Wilson <chris@chris-wilson.co.uk> (for core-for-CI)
Link: https://patchwork.freedesktop.org/patch/msgid/20190502194208.3535-2-daniel.vetter@ffwll.ch
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 kernel/watchdog.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9fa2af9dbf2cec..f1281e9d2bf36c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -214,6 +214,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
+		else
+			add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 
 		per_cpu(watchdog_hardlockup_warned, cpu) = true;
 	} else {
@@ -776,6 +778,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
+		else
+			add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 	}
 
 	return HRTIMER_RESTART;

From 9d4e25c945ca0023f23d8be78a613aa822be3cdf Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 8 May 2023 12:53:35 +0300
Subject: [PATCH 08/21] net/sch_generic: Shut up noise

We can't allow spam in CI.

Update 26th June 2018: This is still an issue:
Update 23rd May 2019: You guessed it, still occurring.

[  224.739686] ------------[ cut here ]------------
[  224.739712] WARNING: CPU: 3 PID: 2982 at net/sched/sch_generic.c:461 dev_watchdog+0x1fd/0x210
[  224.739714] Modules linked in: vgem snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_pcm i915 asix usbnet mii mei_me mei prime_numbers i2c_hid pinctrl_sunrisepoint pinctrl_intel btusb btrtl btbcm btintel bluetooth ecdh_generic
[  224.739775] CPU: 3 PID: 2982 Comm: gem_exec_suspen Tainted: G     U  W         4.18.0-rc2-CI-Patchwork_9414+ #1
[  224.739777] Hardware name: Dell Inc. XPS 13 9350/, BIOS 1.4.12 11/30/2016
[  224.739780] RIP: 0010:dev_watchdog+0x1fd/0x210
[  224.739781] Code: 49 63 4c 24 f0 eb 92 4c 89 ef c6 05 21 46 ad 00 01 e8 77 ee fc ff 89 d9 48 89 c2 4c 89 ee 48 c7 c7 88 4c 14 82 e8 a3 fe 84 ff <0f> 0b eb be 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 c7 47
[  224.739866] RSP: 0018:ffff88027dd83e40 EFLAGS: 00010286
[  224.739869] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000102
[  224.739871] RDX: 0000000080000102 RSI: ffffffff820c8c6c RDI: 00000000ffffffff
[  224.739873] RBP: ffff8802644c1540 R08: 0000000071be9b33 R09: 0000000000000000
[  224.739874] R10: ffff88027dd83dc0 R11: 0000000000000000 R12: ffff8802644c1588
[  224.739876] R13: ffff8802644c1160 R14: 0000000000000001 R15: ffff88026a5dc728
[  224.739878] FS:  00007f18f4887980(0000) GS:ffff88027dd80000(0000) knlGS:0000000000000000
[  224.739880] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  224.739881] CR2: 00007f4c627ae548 CR3: 000000022ca1a002 CR4: 00000000003606e0
[  224.739883] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  224.739885] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  224.739886] Call Trace:
[  224.739888]  <IRQ>
[  224.739892]  ? qdisc_reset+0xe0/0xe0
[  224.739894]  ? qdisc_reset+0xe0/0xe0
[  224.739897]  call_timer_fn+0x93/0x360
[  224.739903]  expire_timers+0xc1/0x1d0
[  224.739908]  run_timer_softirq+0xc7/0x170
[  224.739916]  __do_softirq+0xd9/0x505
[  224.739923]  irq_exit+0xa9/0xc0
[  224.739926]  smp_apic_timer_interrupt+0x9c/0x2d0
[  224.739929]  apic_timer_interrupt+0xf/0x20
[  224.739931]  </IRQ>
[  224.739934] RIP: 0010:delay_tsc+0x2e/0xb0
[  224.739936] Code: 49 89 fc 55 53 bf 01 00 00 00 e8 6d 2c 78 ff e8 88 9d b6 ff 41 89 c5 0f ae e8 0f 31 48 c1 e2 20 48 09 c2 48 89 d5 eb 16 f3 90 <bf> 01 00 00 00 e8 48 2c 78 ff e8 63 9d b6 ff 44 39 e8 75 36 0f ae
[  224.740021] RSP: 0018:ffffc900002f7d48 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
[  224.740024] RAX: 0000000080000000 RBX: 0000000649565ca9 RCX: 0000000000000001
[  224.740026] RDX: 0000000080000001 RSI: ffffffff820c8c6c RDI: 00000000ffffffff
[  224.740027] RBP: 00000006493ea9ce R08: 000000005e81e2ee R09: 0000000000000000
[  224.740029] R10: 0000000000000120 R11: 0000000000000000 R12: 00000000002ad8d6
[  224.740030] R13: 0000000000000003 R14: 0000000000000004 R15: ffff88025caf5408
[  224.740040]  ? delay_tsc+0x66/0xb0
[  224.740045]  hibernation_debug_sleep+0x1c/0x30
[  224.740048]  hibernation_snapshot+0x2c1/0x690
[  224.740053]  hibernate+0x142/0x2a4
[  224.740057]  state_store+0xd0/0xe0
[  224.740063]  kernfs_fop_write+0x104/0x190
[  224.740068]  __vfs_write+0x31/0x180
[  224.740072]  ? rcu_read_lock_sched_held+0x6f/0x80
[  224.740075]  ? rcu_sync_lockdep_assert+0x29/0x50
[  224.740078]  ? __sb_start_write+0x152/0x1f0
[  224.740080]  ? __sb_start_write+0x168/0x1f0
[  224.740084]  vfs_write+0xbd/0x1a0
[  224.740088]  ksys_write+0x50/0xc0
[  224.740094]  do_syscall_64+0x55/0x190
[  224.740097]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  224.740099] RIP: 0033:0x7f18f400a281
[  224.740100] Code: c3 0f 1f 84 00 00 00 00 00 48 8b 05 59 8d 20 00 c3 0f 1f 84 00 00 00 00 00 8b 05 8a d1 20 00 85 c0 75 16 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 41 54 55 49 89 d4 53
[  224.740186] RSP: 002b:00007fffd1f4fec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  224.740189] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f18f400a281
[  224.740190] RDX: 0000000000000004 RSI: 00007f18f448069a RDI: 0000000000000006
[  224.740192] RBP: 00007fffd1f4fef0 R08: 0000000000000000 R09: 0000000000000000
[  224.740194] R10: 0000000000000000 R11: 0000000000000246 R12: 000055e795d03400
[  224.740195] R13: 00007fffd1f50500 R14: 0000000000000000 R15: 0000000000000000
[  224.740205] irq event stamp: 1582591
[  224.740207] hardirqs last  enabled at (1582590): [<ffffffff810f9f9c>] vprintk_emit+0x4bc/0x4d0
[  224.740210] hardirqs last disabled at (1582591): [<ffffffff81a0111c>] error_entry+0x7c/0x100
[  224.740212] softirqs last  enabled at (1582568): [<ffffffff81c0034f>] __do_softirq+0x34f/0x505
[  224.740215] softirqs last disabled at (1582571): [<ffffffff8108c959>] irq_exit+0xa9/0xc0
[  224.740218] WARNING: CPU: 3 PID: 2982 at net/sched/sch_generic.c:461 dev_watchdog+0x1fd/0x210
[  224.740219] ---[ end trace 6e41d690e611c338 ]---

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8037
References: https://bugzilla.kernel.org/show_bug.cgi?id=196399
Acked-by: Martin Peres <martin.peres@linux.intel.com>
Cc: Martin Peres <martin.peres@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170718082110.12524-1-daniel.vetter@ffwll.ch
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 net/sched/sch_generic.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 514b1b6ac68196..a518fcdb96ba41 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -530,7 +530,12 @@ static void dev_watchdog(struct timer_list *t)
 					oldest_start = trans_start;
 			}
 
-			if (unlikely(timedout_ms)) {
+			/* The noise is pissing off our CI and upstream doesn't
+			 * move on the bug report:
+			 *
+			 * https://bugzilla.kernel.org/show_bug.cgi?id=196399
+			 */
+			if (unlikely(timedout_ms) && 0) {
 				trace_net_dev_xmit_timeout(dev, i);
 				netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n",
 					    raw_smp_processor_id(),

From c36f2dc9f8c58946a054756de4a43754b09ebdc1 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 1 Jul 2019 15:29:03 +0100
Subject: [PATCH 09/21] mm: Show slab debug as offsets from section base not
 hashed pointers

Since the kernel now uses hashed pointers for raw addresses, it is very
hard to gauge the relative placement within a section, and since the
hash value will never match up with any contents, using it provides no
information relevant for slab debugging. Show the relative offset into
each section, so that some reference for the hexdump is provided.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8038
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71d4..bfd924674ad8d0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -910,7 +910,7 @@ static void print_section(char *level, char *text, u8 *addr,
 			  unsigned int length)
 {
 	metadata_access_enable();
-	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
+	print_hex_dump(level, text, DUMP_PREFIX_OFFSET,
 			16, 1, kasan_reset_tag((void *)addr), length, 1);
 	metadata_access_disable();
 }

From 320d83b9dcc4618faa31009a434e127e0262ec0e Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 23 Apr 2020 09:27:53 +0100
Subject: [PATCH 10/21] pci/msi: Stop warning for MSI enabling failure

If the MSI is already enabled, trying to enable it again results in an
-EINVAL and on the first attempt a WARN. That WARN causes our CI to
abort the run [on each first attempt to suspend]:

<4> [463.142025] WARNING: CPU: 0 PID: 2225 at drivers/pci/msi.c:1074 __pci_enable_msi_range+0x3cb/0x420
<4> [463.142026] Modules linked in: snd_hda_intel i915 snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic mei_hdcp x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul snd_intel_dspcfg ghash_clmulni_intel snd_hda_codec btusb btrtl btbcm btintel e1000e bluetooth snd_hwdep snd_hda_core ptp ecdh_generic snd_pcm ecc pps_core mei_me mei prime_numbers [last unloaded: i915]
<4> [463.142045] CPU: 0 PID: 2225 Comm: kworker/u8:14 Tainted: G     U            5.7.0-rc2-CI-CI_DRM_8350+ #1
<4> [463.142046] Hardware name: Intel Corporation NUC7i5BNH/NUC7i5BNB, BIOS BNKBL357.86A.0060.2017.1214.2013 12/14/2017
<4> [463.142049] Workqueue: events_unbound async_run_entry_fn
<4> [463.142051] RIP: 0010:__pci_enable_msi_range+0x3cb/0x420
<4> [463.142053] Code: 76 58 49 8d 56 48 48 89 df e8 31 73 fd ff e9 20 fe ff ff 31 f6 48 89 df e8 c2 e9 fd ff e9 d6 fe ff ff 45 89 fc e9 1a ff ff ff <0f> 0b 41 bc ea ff ff ff e9 0d ff ff ff 41 bc ea ff ff ff e9 02 ff
<4> [463.142054] RSP: 0018:ffffc90000593cd0 EFLAGS: 00010202
<4> [463.142056] RAX: 0000000000000010 RBX: ffff888274051000 RCX: 0000000000000000
<4> [463.142057] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff888274051000
<4> [463.142058] RBP: ffff888238aa1018 R08: 0000000000000001 R09: 0000000000000001
<4> [463.142060] R10: ffffc90000593d90 R11: 00000000c79cdfd5 R12: ffff8882740510b0
<4> [463.142061] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001
<4> [463.142062] FS:  0000000000000000(0000) GS:ffff888276c00000(0000) knlGS:0000000000000000
<4> [463.142064] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4> [463.142065] CR2: 000055706f347d80 CR3: 0000000005610003 CR4: 00000000003606f0
<4> [463.142066] Call Trace:
<4> [463.142073]  pci_enable_msi+0x11/0x20
<4> [463.142077]  azx_resume+0x1ab/0x200 [snd_hda_intel]
<4> [463.142080]  ? pci_pm_thaw+0x80/0x80
<4> [463.142084]  dpm_run_callback+0x64/0x280
<4> [463.142089]  device_resume+0xd4/0x1c0
<4> [463.142093]  ? dpm_watchdog_set+0x60/0

While this would appear to be a bug in snd-hda, it does appear
inconsequential, at least for gfx-ci.

Downgrade the warning to an info, like the other already-enabled error
for MSI-X.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8041
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1687
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200423082753.3899018-1-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/pci/msi/msi.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 6569ba3577fe63..a11fb9c2e857f2 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -429,8 +429,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (maxvec < minvec)
 		return -ERANGE;
 
-	if (WARN_ON_ONCE(dev->msi_enabled))
+	if (dev->msi_enabled) {
+		pci_info(dev, "can't enable MSI, already enabled\n");
 		return -EINVAL;
+	}
 
 	/* Test for the availability of MSI support */
 	if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY))

From 016489e6a1d940bab49460ab70338deb0598cf0b Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 17 Dec 2020 16:47:00 +0000
Subject: [PATCH 11/21] HAX net/phy: Suppress WARN for calling stop while
 halted

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8046
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2805
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/net/phy/phy.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 13df28445f0201..75654b5a8858bb 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1626,11 +1626,8 @@ void phy_stop(struct phy_device *phydev)
 	enum phy_state old_state;
 
 	if (!phy_is_started(phydev) && phydev->state != PHY_DOWN &&
-	    phydev->state != PHY_ERROR) {
-		WARN(1, "called from state %s\n",
-		     phy_state_to_str(phydev->state));
+	    phydev->state != PHY_ERROR)
 		return;
-	}
 
 	mutex_lock(&phydev->lock);
 	old_state = phydev->state;

From 2822b185a2a63046a9f9c5c2c4759afbed22c824 Mon Sep 17 00:00:00 2001
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Date: Mon, 8 May 2023 13:09:13 +0300
Subject: [PATCH 12/21] HAX net/phy: Suppress WARN from phy_error

References: https://gitlab.freedesktop.org/drm/intel/-/issues/8047
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2874
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 75654b5a8858bb..2e2fcb83141b89 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1361,7 +1361,7 @@ static void phy_error_precise(struct phy_device *phydev,
  */
 void phy_error(struct phy_device *phydev)
 {
-	WARN_ON(1);
+	pr_notice_once("%s\n", __func__);
 	phy_process_error(phydev);
 }
 EXPORT_SYMBOL(phy_error);

From 4e8698d074c82bb9b09e37f7d5f0cd663551697d Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 4 Jun 2024 19:16:18 +0300
Subject: [PATCH 13/21] thunderbolt: Add Kconfig option to disable PCIe
 tunneling

In typical cases PCIe tunneling is needed to make the devices fully
usable for the host system. However, it poses a security issue because
they can also use DMA to access the host memory. We already have two
ways of preventing this: one is an IOMMU, which is enabled on recent
systems by default, and the second is the "authorized" attribute under
each connected device that needs to be written by userspace before a
PCIe tunnel is created. This patch adds a third: a Kconfig option,
enabled by default, that can be used to build kernel binaries in which
PCIe tunneling is completely disabled.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
References: https://intel-gfx-ci.01.org/tree/drm-tip/Trybot_134314v1/bat-mtlp-9/boot0.txt
References: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11261
Signed-off-by: Imre Deak <imre.deak@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240604161618.1958674-1-imre.deak@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/thunderbolt/Kconfig  | 18 ++++++++++++++++++
 drivers/thunderbolt/tb.c     |  2 +-
 drivers/thunderbolt/tb.h     |  9 +++++++++
 drivers/thunderbolt/tunnel.c |  8 ++++----
 drivers/thunderbolt/usb4.c   |  2 +-
 5 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/drivers/thunderbolt/Kconfig b/drivers/thunderbolt/Kconfig
index 0abdb69ee9f437..8bf4ecf7f76eef 100644
--- a/drivers/thunderbolt/Kconfig
+++ b/drivers/thunderbolt/Kconfig
@@ -18,6 +18,24 @@ menuconfig USB4
 
 if USB4
 
+config USB4_PCIE_TUNNELING
+	bool "Allow PCI Express tunneling over USB4 fabric"
+	depends on PCI
+	default y
+	help
+	  USB4 and Thunderbolt devices typically include PCIe switch
+	  with a number of PCIe endpoints such as USB host controllers,
+	  GPUs and network adapters. These are made available to the
+	  host system through PCIe tunneling. These can use DMA and
+	  therefore have access to the host memory which is typically
+	  guarded by an IOMMU. This option allows disabling PCIe
+	  tunneling completely.
+
+	  For devices to be usable it is recommended to say Y here.
+
+	  Note this only works with systems that use Software Based
+	  Connection Manager (this is most USB4 hosts).
+
 config USB4_DEBUGFS_WRITE
 	bool "Enable write by debugfs to configuration spaces (DANGEROUS)"
 	help
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 8c527af989271c..f19839b3b2c6d1 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -3348,7 +3348,7 @@ struct tb *tb_probe(struct tb_nhi *nhi)
 	if (!tb)
 		return NULL;
 
-	if (tb_acpi_may_tunnel_pcie())
+	if (tb_may_tunnel_pcie())
 		tb->security_level = TB_SECURITY_USER;
 	else
 		tb->security_level = TB_SECURITY_NOPCIE;
diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h
index b54147a1ba8778..d1ce8f1eccf6c1 100644
--- a/drivers/thunderbolt/tb.h
+++ b/drivers/thunderbolt/tb.h
@@ -1504,6 +1504,15 @@ static inline int tb_acpi_power_on_retimers(struct tb_port *port) { return 0; }
 static inline int tb_acpi_power_off_retimers(struct tb_port *port) { return 0; }
 #endif
 
+static inline bool tb_may_tunnel_pcie(void)
+{
+#ifdef CONFIG_USB4_PCIE_TUNNELING
+	return tb_acpi_may_tunnel_pcie();
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_DEBUG_FS
 void tb_debugfs_init(void);
 void tb_debugfs_exit(void);
diff --git a/drivers/thunderbolt/tunnel.c b/drivers/thunderbolt/tunnel.c
index 76254ed3f47f4b..d3fc4173d9cecb 100644
--- a/drivers/thunderbolt/tunnel.c
+++ b/drivers/thunderbolt/tunnel.c
@@ -122,7 +122,7 @@ static unsigned int tb_available_credits(const struct tb_port *port,
 	size_t ndp;
 
 	usb3 = tb_acpi_may_tunnel_usb3() ? sw->max_usb3_credits : 0;
-	pcie = tb_acpi_may_tunnel_pcie() ? sw->max_pcie_credits : 0;
+	pcie = tb_may_tunnel_pcie() ? sw->max_pcie_credits : 0;
 
 	if (tb_acpi_is_xdomain_allowed()) {
 		spare = min_not_zero(sw->max_dma_credits, dma_credits);
@@ -479,7 +479,7 @@ bool tb_tunnel_reserved_pci(struct tb_port *port, int *reserved_up,
 	if (WARN_ON_ONCE(!port->remote))
 		return false;
 
-	if (!tb_acpi_may_tunnel_pcie())
+	if (!tb_may_tunnel_pcie())
 		return false;
 
 	if (tb_port_get_link_generation(port) < 4)
@@ -1646,7 +1646,7 @@ static unsigned int tb_dma_available_credits(const struct tb_port *port)
 	int credits;
 
 	credits = tb_available_credits(port, NULL);
-	if (tb_acpi_may_tunnel_pcie())
+	if (tb_may_tunnel_pcie())
 		credits -= sw->max_pcie_credits;
 	credits -= port->dma_credits;
 
@@ -1957,7 +1957,7 @@ static int tb_usb3_consumed_bandwidth(struct tb_tunnel *tunnel,
 		int *consumed_up, int *consumed_down)
 {
 	struct tb_port *port = tb_upstream_port(tunnel->dst_port->sw);
-	int pcie_weight = tb_acpi_may_tunnel_pcie() ? TB_PCI_WEIGHT : 0;
+	int pcie_weight = tb_may_tunnel_pcie() ? TB_PCI_WEIGHT : 0;
 
 	/*
 	 * PCIe tunneling, if enabled, affects the USB3 bandwidth so
diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c
index e51d01671d8e7c..1b740d7fc7dab0 100644
--- a/drivers/thunderbolt/usb4.c
+++ b/drivers/thunderbolt/usb4.c
@@ -276,7 +276,7 @@ int usb4_switch_setup(struct tb_switch *sw)
 	 * Only enable PCIe tunneling if the parent router supports it
 	 * and it is not disabled.
 	 */
-	if (tb_acpi_may_tunnel_pcie() &&
+	if (tb_may_tunnel_pcie() &&
 	    tb_switch_find_port(parent, TB_TYPE_PCIE_DOWN)) {
 		val |= ROUTER_CS_5_PTO;
 		/*

From abfe4d1d04b5806ffcf552b2482b8b279846f152 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Mon, 3 Feb 2025 15:31:13 +0200
Subject: [PATCH 14/21] Revert "lockdep: Enable PROVE_RAW_LOCK_NESTING with
 PROVE_LOCKING."

This reverts commit 560af5dc839eef08a273908f390cfefefb82aa04.

Locking in i915_pmu.c interacting with perf is completely wrong. It's
using spinlock_t everywhere when it should actually use raw_spinlock_t
since perf is already holding raw_spinlock in the caller. This started
to be checked with commit 560af5dc839e ("lockdep: Enable
PROVE_RAW_LOCK_NESTING with PROVE_LOCKING."), but should only be a real
issue when PREEMPT_RT is enabled: in that config, the spinlock_t can
sleep, which creates issues.

Reworking the locks in i915_pmu.c is not very simple as changing locks
to raw_spinlock_t cascades to too many locks, which is both a) not
desired from an RT perspective and b) hard to get right as it calls into
other parts of the driver that have other requirements.

Example backtrace:

<4> [141.043897] =============================
<4> [141.043922] [ BUG: Invalid wait context ]
<4> [141.043940] 6.13.0-rc2-CI_DRM_15820-g78bd7a249aa0+ #1 Not tainted
<4> [141.043964] -----------------------------
<4> [141.043981] swapper/0/0 is trying to lock:
<4> [141.044000] ffff88810861b910 (&pmu->lock){....}-{3:3}, at: i915_pmu_enable+0x48/0x3a0 [i915]
<4> [141.044194] other info that might help us debug this:
<4> [141.044217] context-{5:5}
<4> [141.044229] 1 lock held by swapper/0/0:
<4> [141.044248]  #0: ffff88885f432038 (&cpuctx_lock){....}-{2:2}, at: __perf_install_in_context+0x3f/0x360
<4> [141.044297] stack backtrace:
<4> [141.044312] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc2-CI_DRM_15820-g78bd7a249aa0+ #1
<4> [141.044353] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [141.044405] Call Trace:
<4> [141.044419]  <TASK>
<4> [141.044431]  dump_stack_lvl+0x91/0xf0
<4> [141.044454]  dump_stack+0x10/0x20
<4> [141.044472]  __lock_acquire+0x990/0x2820
<4> [141.044498]  lock_acquire+0xc9/0x300
<4> [141.044518]  ? i915_pmu_enable+0x48/0x3a0 [i915]
<4> [141.044689]  _raw_spin_lock_irqsave+0x49/0x80
<4> [141.044713]  ? i915_pmu_enable+0x48/0x3a0 [i915]
<4> [141.044903]  i915_pmu_enable+0x48/0x3a0 [i915]
<4> [141.045112]  ? __lock_acquire+0x455/0x2820
<4> [141.045142]  i915_pmu_event_add+0x71/0x90 [i915]

More time is needed to get this fixed properly, but let's not pile
regressions on top.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241211121703.2890150-1-luciano.coelho@intel.com
[ Reword commit message, giving more detail on what the issue is ]
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
References: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13311
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 lib/Kconfig.debug | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9fe4d8dfe57829..a5e76c86b32f04 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1420,12 +1420,20 @@ config PROVE_LOCKING
 config PROVE_RAW_LOCK_NESTING
 	bool "Enable raw_spinlock - spinlock nesting checks" if !ARCH_SUPPORTS_RT
 	depends on PROVE_LOCKING
-	default y if ARCH_SUPPORTS_RT
+	default n
 	help
 	 Enable the raw_spinlock vs. spinlock nesting checks which ensure
 	 that the lock nesting rules for PREEMPT_RT enabled kernels are
 	 not violated.
 
+	 NOTE: There are known nesting problems. So if you enable this
+	 option expect lockdep splats until these problems have been fully
+	 addressed which is work in progress. This config switch allows to
+	 identify and analyze these problems. It will be removed and the
+	 check permanently enabled once the main issues have been fixed.
+
+	 If unsure, select N.
+
 config LOCK_STAT
 	bool "Lock usage statistics"
 	depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT

From 6c87d5b5bc046f802813f750485414e3fc3d5914 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date: Thu, 11 Jul 2024 21:23:20 +0200
Subject: [PATCH 15/21] drm/xe: Enable SR-IOV for ADL/ATSM

We should now have sufficient changes in the Xe driver to
run it on ADL and ATSM platforms in the PF mode, to configure VFs
and successfully probe driver on the enabled VF devices.

While some more changes are likely still needed to fix all corner
cases, we will not find them without running any tests. To start
testing this feature by the CI, we need to mark which platforms
have basic SR-IOV support and let the driver run in the PF mode.

Since this feature support is still in the early testing stage,
make all enabling available only for CONFIG_DRM_XE_DEBUG=y and
keep it on CI topic branch.

Note that from this point, on selected platforms, the Xe driver
will be acting as a PF driver, with some SR-IOV specific changes
compared to running in the non-virtualized (native) mode.

However, those specific changes are visible mostly on the debugfs,
and should not impact normal driver execution, unless VFs will be
manually provisioned or explicitly enabled.

Once we finish adding the remaining SR-IOV tests to the CI and fix
any issues that we find in the meantime, we will replace this patch
with proper series outside the topic branch.

Suggested-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Acked-by: Lucas De Marchi <lucas.demarchi@intel.com>
Acked-by: Thomas Hellstrom <thomas.hellstrom@linux.intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711192320.1198-3-michal.wajdeczko@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_module.c | 3 +++
 drivers/gpu/drm/xe/xe_pci.c    | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index be8603b16ff3fb..15b3cf22193c47 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -22,6 +22,9 @@ struct xe_modparam xe_modparam = {
 	.probe_display = true,
 	.guc_log_level = 3,
 	.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
+#ifdef CONFIG_PCI_IOV
+	.max_vfs = IS_ENABLED(CONFIG_DRM_XE_DEBUG) ? ~0 : 0,
+#endif
 	.wedged_mode = 1,
 	.svm_notifier_size = 512,
 	/* the rest are 0 by default */
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 07fe994f2a807d..c965898b3ebf57 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -224,6 +224,7 @@ static const struct xe_device_desc adl_s_desc = {
 	.dma_mask_size = 39,
 	.has_display = true,
 	.has_llc = true,
+	.has_sriov = IS_ENABLED(CONFIG_DRM_XE_DEBUG),
 	.require_force_probe = true,
 	.subplatforms = (const struct xe_subplatform_desc[]) {
 		{ XE_SUBPLATFORM_ALDERLAKE_S_RPLS, "RPLS", adls_rpls_ids },
@@ -240,6 +241,7 @@ static const struct xe_device_desc adl_p_desc = {
 	.dma_mask_size = 39,
 	.has_display = true,
 	.has_llc = true,
+	.has_sriov = IS_ENABLED(CONFIG_DRM_XE_DEBUG),
 	.require_force_probe = true,
 	.subplatforms = (const struct xe_subplatform_desc[]) {
 		{ XE_SUBPLATFORM_ALDERLAKE_P_RPLU, "RPLU", adlp_rplu_ids },
@@ -254,6 +256,7 @@ static const struct xe_device_desc adl_n_desc = {
 	.dma_mask_size = 39,
 	.has_display = true,
 	.has_llc = true,
+	.has_sriov = IS_ENABLED(CONFIG_DRM_XE_DEBUG),
 	.require_force_probe = true,
 };
 
@@ -294,6 +297,7 @@ static const struct xe_device_desc ats_m_desc = {
 
 	DG2_FEATURES,
 	.has_display = false,
+	.has_sriov = IS_ENABLED(CONFIG_DRM_XE_DEBUG),
 };
 
 static const struct xe_device_desc dg2_desc = {

From e3332d1bcbe86d145a19301cff0dc51d3270991c Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Fri, 8 Dec 2023 12:11:06 -0500
Subject: [PATCH 16/21] drm/xe: Add PVC's PCI device IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces Ponte Vecchio support in Xe driver.

Please note that, besides this patch, force_probe likely still
needs to be used in order to actually enable the support for
PVC.

This patch was separated from the rest so we can ensure
compliance with DRM uAPI rules on compute platforms.

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Oded Gabbay <ogabbay@kernel.org>
Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1154
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index c965898b3ebf57..4fe7e0d941a911 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -380,6 +380,7 @@ static const struct pci_device_id pciidlist[] = {
 	INTEL_ATS_M_IDS(INTEL_VGA_DEVICE, &ats_m_desc),
 	INTEL_ARL_IDS(INTEL_VGA_DEVICE, &mtl_desc),
 	INTEL_DG2_IDS(INTEL_VGA_DEVICE, &dg2_desc),
+	INTEL_PVC_IDS(INTEL_VGA_DEVICE, &pvc_desc),
 	INTEL_MTL_IDS(INTEL_VGA_DEVICE, &mtl_desc),
 	INTEL_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc),
 	INTEL_BMG_IDS(INTEL_VGA_DEVICE, &bmg_desc),

From 20f634360f3f26b4fb97e64622c681b473d41443 Mon Sep 17 00:00:00 2001
From: Julia Filipchuk <julia.filipchuk@intel.com>
Date: Thu, 3 Apr 2025 11:56:16 -0700
Subject: [PATCH 17/21] drm/xe/pvc: Add GuC firmware definition

Add pre-release support for PVC.

UAPI version 1.13.4.

Signed-off-by: Julia Filipchuk <julia.filipchuk@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250403185619.1555853-7-John.C.Harrison@Intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_uc_fw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index 2741849bbf4df3..fe1813deff004b 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -117,6 +117,7 @@ struct fw_blobs_by_type {
 	fw_def(BATTLEMAGE,	GT_TYPE_ANY,	major_ver(xe,	guc,	bmg,	70, 44, 1))	\
 	fw_def(LUNARLAKE,	GT_TYPE_ANY,	major_ver(xe,	guc,	lnl,	70, 44, 1))	\
 	fw_def(METEORLAKE,	GT_TYPE_ANY,	major_ver(i915,	guc,	mtl,	70, 44, 1))	\
+	fw_def(PVC,		GT_TYPE_ANY,	mmp_ver(xe,	guc,	pvc,	70, 44, 1))	\
 	fw_def(DG2,		GT_TYPE_ANY,	major_ver(i915,	guc,	dg2,	70, 44, 1))	\
 	fw_def(DG1,		GT_TYPE_ANY,	major_ver(i915,	guc,	dg1,	70, 44, 1))	\
 	fw_def(ALDERLAKE_N,	GT_TYPE_ANY,	major_ver(i915,	guc,	tgl,	70, 44, 1))	\

From 727af07109f42257bedc30c19c98fec3805c1c89 Mon Sep 17 00:00:00 2001
From: Clint Taylor <clinton.a.taylor@intel.com>
Date: Thu, 3 Apr 2025 11:56:17 -0700
Subject: [PATCH 18/21] drm/xe/ptl: Add GuC firmware definition

Define the GuC firmware to load on the platform.

Signed-off-by: Clint Taylor <clinton.a.taylor@intel.com>
Signed-off-by: Matt Atwood <matthew.s.atwood@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Acked-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250403185619.1555853-8-John.C.Harrison@Intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_uc_fw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index fe1813deff004b..1db598d3b1de3f 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -114,6 +114,7 @@ struct fw_blobs_by_type {
 #define XE_GT_TYPE_ANY XE_GT_TYPE_UNINITIALIZED
 
 #define XE_GUC_FIRMWARE_DEFS(fw_def, mmp_ver, major_ver)					\
+	fw_def(PANTHERLAKE,	GT_TYPE_ANY,	mmp_ver(xe,	guc,	ptl,	70, 44, 1))	\
 	fw_def(BATTLEMAGE,	GT_TYPE_ANY,	major_ver(xe,	guc,	bmg,	70, 44, 1))	\
 	fw_def(LUNARLAKE,	GT_TYPE_ANY,	major_ver(xe,	guc,	lnl,	70, 44, 1))	\
 	fw_def(METEORLAKE,	GT_TYPE_ANY,	major_ver(i915,	guc,	mtl,	70, 44, 1))	\

From b7aa26cfe2bf552630ece2cce510350d1360b575 Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Fri, 7 Mar 2025 19:56:36 -0500
Subject: [PATCH 19/21] drm/xe/pm: Re-enable D3Cold by default on BMG

This patch re-enables D3Cold by default on BMG.

If issues on runtime_pm resume are seen and the D3cold->D0 transition
is suspected to block the device or cause memory corruptions, D3cold
can be disabled for confirmation with either:

1. at runtime:
   echo 0 > /sys/bus/pci/devices/<addr>/vram_d3cold_threshold

2. at boot:
   pcie_port_pm=off

Upon confirmation of a D3Cold-related bug, please file an issue at the
link below.

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250308005636.1475420-2-rodrigo.vivi@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_pm.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 4e112fbacada45..aaba2a97bb3aa3 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -279,10 +279,6 @@ ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */
 
 static u32 vram_threshold_value(struct xe_device *xe)
 {
-	/* FIXME: D3Cold temporarily disabled by default on BMG */
-	if (xe->info.platform == XE_BATTLEMAGE)
-		return 0;
-
 	return DEFAULT_VRAM_THRESHOLD;
 }
 

From 21c916e4f3d332d70fea7d5659d341de32c6ca9e Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Date: Fri, 11 Apr 2025 14:44:53 +0300
Subject: [PATCH 20/21] drm-tip: 2025y-04m-11d-11h-44m-23s UTC integration
 manifest

---
 integration-manifest | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 integration-manifest

diff --git a/integration-manifest b/integration-manifest
new file mode 100644
index 00000000000000..dc489857ad36ba
--- /dev/null
+++ b/integration-manifest
@@ -0,0 +1,28 @@
+drm drm-fixes 485442c6a523de1d293350e039a9d9df9c08704c
+	Merge tag 'drm-xe-fixes-2025-04-10' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
+drm-misc drm-misc-fixes 31660b406d872b5ccb3c2ec6f932969809c35b18
+	accel/ivpu: Add cmdq_id to job related logs
+drm-intel drm-intel-fixes e3ea2eae70692a455e256787e4f54153fb739b90
+	drm/i915/huc: Fix fence not released on early probe errors
+drm-xe drm-xe-fixes 88ecb66b9956a14577d513a6c8c28bb2e7989703
+	drm/xe: Restore EIO errno return when GuC PC start fails
+drm drm-next 0af2f6be1b4281385b618cb86ad946eded089ac8
+	Linux 6.15-rc1
+drm-misc drm-misc-next-fixes 85a063b8b281e144ed96463936fb4e6b3d4fe9e4
+	drm/i2c: tda998x: select CONFIG_DRM_KMS_HELPER
+drm-intel drm-intel-next-fixes 0af2f6be1b4281385b618cb86ad946eded089ac8
+	Linux 6.15-rc1
+drm-xe drm-xe-next-fixes 5e66cf6edddb5f6237e3afb07475ace57ecb56bc
+	drm/xe: Fix unmet direct dependencies warning
+drm-misc drm-misc-next 4c962bc929f1734d209a0862359e25fef8f56fa0
+	drm/hisilicon/hibmc: Add vga connector detect functions
+drm-intel drm-intel-next 1954629dc649b25071eec0d353288c5ee303e358
+	drm/i915/debugfs: move PCH type to display caps
+drm-intel drm-intel-gt-next 795dbde92fe5c6996a02a5b579481de73035e7bf
+	drm/i915/huc: Fix fence not released on early probe errors
+drm-xe drm-xe-next d11c5a928a6e1d786e25a9284ef59bf58a02cf0d
+	drm/xe/vf: Don't expose privileged GT debugfs files if VF
+drm-intel topic/core-for-CI abfe4d1d04b5806ffcf552b2482b8b279846f152
+	Revert "lockdep: Enable PROVE_RAW_LOCK_NESTING with PROVE_LOCKING."
+drm-xe topic/xe-for-CI b7aa26cfe2bf552630ece2cce510350d1360b575
+	drm/xe/pm: Re-enable D3Cold by default on BMG

From dcb5e6d1ddcf5a8a2e771095a253dba29c72162c Mon Sep 17 00:00:00 2001
From: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Date: Tue, 29 Apr 2025 20:00:55 +0530
Subject: [PATCH 21/21] drm/i915/vrr: Program EMP_AS_SDP_TL for DP AS SDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The register EMP_AS_SDP_TL (MTL) was introduced for configuring the
double buffering point and transmission line for
HDMI Video Timing Extended Metadata Packet (VTEMP) for VRR.
This was also intended to be configured for DP to HDMI2.1 PCON to
support VRR.

Starting with BMG and LNL+, this register was extended to Display Port
Adaptive Sync SDP to have a common register to configure double
buffering point and transmission line for both HDMI and DP VRR related
packets.

Currently, we do not support VRR for either native HDMI or via PCON.
However, we need to configure this for the DP SDP case. As per the spec,
program the register to set Vsync start as the double buffering point
for DP AS SDP.

Bspec:70984, 71197

Signed-off-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Tested-by: Jouni Högander <jouni.hogander@intel.com>
---
 drivers/gpu/drm/i915/display/intel_vrr.c      | 20 +++++++++++++++++++
 drivers/gpu/drm/i915/display/intel_vrr_regs.h |  6 ++++++
 2 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/i915/display/intel_vrr.c b/drivers/gpu/drm/i915/display/intel_vrr.c
index 633a66f6b73be3..39706a458d3b4d 100644
--- a/drivers/gpu/drm/i915/display/intel_vrr.c
+++ b/drivers/gpu/drm/i915/display/intel_vrr.c
@@ -573,6 +573,22 @@ bool intel_vrr_always_use_vrr_tg(struct intel_display *display)
 	return false;
 }
 
+static
+void intel_vrr_set_emp_as_sdp_tl(const struct intel_crtc_state *crtc_state)
+{
+	struct intel_display *display = to_intel_display(crtc_state);
+	enum transcoder cpu_transcoder = crtc_state->cpu_transcoder;
+
+	/*
+	 * For BMG and LNL+ onwards the EMP_AS_SDP_TL is used for programming
+	 * double buffering point and transmission line for Adaptive Sync SDP.
+	 */
+	if (DISPLAY_VERx100(display) == 1401 || DISPLAY_VER(display) >= 20)
+		intel_de_write(display,
+			       EMP_AS_SDP_TL(display, cpu_transcoder),
+			       EMP_AS_SDP_DB_TL(crtc_state->vrr.vsync_start));
+}
+
 void intel_vrr_enable(const struct intel_crtc_state *crtc_state)
 {
 	struct intel_display *display = to_intel_display(crtc_state);
@@ -592,6 +608,8 @@ void intel_vrr_enable(const struct intel_crtc_state *crtc_state)
 		       TRANS_PUSH_EN);
 
 	if (!intel_vrr_always_use_vrr_tg(display)) {
+		intel_vrr_set_emp_as_sdp_tl(crtc_state);
+
 		if (crtc_state->cmrr.enable) {
 			intel_de_write(display, TRANS_VRR_CTL(display, cpu_transcoder),
 				       VRR_CTL_VRR_ENABLE | VRR_CTL_CMRR_ENABLE |
@@ -643,6 +661,8 @@ void intel_vrr_transcoder_enable(const struct intel_crtc_state *crtc_state)
 	intel_de_write(display, TRANS_PUSH(display, cpu_transcoder),
 		       TRANS_PUSH_EN);
 
+	intel_vrr_set_emp_as_sdp_tl(crtc_state);
+
 	intel_de_write(display, TRANS_VRR_CTL(display, cpu_transcoder),
 		       VRR_CTL_VRR_ENABLE | trans_vrr_ctl(crtc_state));
 }
diff --git a/drivers/gpu/drm/i915/display/intel_vrr_regs.h b/drivers/gpu/drm/i915/display/intel_vrr_regs.h
index 6ed0e0dc97e76d..d2af1b6710bf15 100644
--- a/drivers/gpu/drm/i915/display/intel_vrr_regs.h
+++ b/drivers/gpu/drm/i915/display/intel_vrr_regs.h
@@ -108,6 +108,12 @@
 #define VRR_VSYNC_START_MASK			REG_GENMASK(12, 0)
 #define VRR_VSYNC_START(vsync_start)		REG_FIELD_PREP(VRR_VSYNC_START_MASK, (vsync_start))
 
+/* Common register for HDMI VTEMP and DP AS SDP */
+#define _EMP_AS_SDP_TL_A			0x60204
+#define EMP_AS_SDP_DB_TL_MASK			REG_GENMASK(12, 0)
+#define EMP_AS_SDP_TL(dev_priv, trans)		_MMIO_TRANS2(dev_priv, trans, _EMP_AS_SDP_TL_A)
+#define EMP_AS_SDP_DB_TL(db_transmit_line)	REG_FIELD_PREP(EMP_AS_SDP_DB_TL_MASK, (db_transmit_line))
+
 /*CMRR Registers*/
 
 #define _TRANS_CMRR_M_LO_A			0x604F0