MALI: rockchip: upgrade bifrost DDK to g15p0-01eac0, from g13p0-01eac0

Note, the corresponding mali_csffw.bin for DDK g15 MUST be used.

Change-Id: Ic30634fa6247d62bf96f506c64d13b89e16b02e6
Signed-off-by: Zhen Chen <chenzhen@rock-chips.com>
2026-06-07 05:55:44 +02:00 · 2022-10-13 08:42:04 +08:00 · 2022-10-13 08:42:04 +08:00 · 034aad5dd8
commit 034aad5dd8
parent 96e93dba44
192 changed files with 10415 additions and 8170 deletions
--- a/Documentation/ABI/testing/sysfs-device-mali
+++ b/Documentation/ABI/testing/sysfs-device-mali
@@ -236,6 +236,7 @@ Description:
 		device-driver that supports a CSF GPU. The duration value unit
 		is in milliseconds and is used for configuring csf scheduling
 		tick duration.
+
 What:		/sys/class/misc/mali%u/device/reset_timeout
 Description:
 		This attribute is used to set the number of milliseconds to
--- a/Documentation/devicetree/bindings/arm/mali-bifrost.txt
+++ b/Documentation/devicetree/bindings/arm/mali-bifrost.txt
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2013-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2013-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@@ -129,7 +129,7 @@ for details.
 		   set and the setting coresponding to the SYSC_ALLOC register.


-Example for a Mali GPU with 1 clock and no regulators:
+Example for a Mali GPU with 1 clock and 1 regulator:

 gpu@0xfc010000 {
 	compatible = "arm,malit602", "arm,malit60x", "arm,malit6xx", "arm,mali-midgard";
--- a/drivers/base/arm/Kbuild
+++ b/drivers/base/arm/Kbuild
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@@ -28,7 +28,6 @@ subdir-ccflags-y += $(ccflags-y)
 #
 # Kernel modules
 #
-obj-$(CONFIG_DMA_BUF_LOCK) += dma_buf_lock/src/
 obj-$(CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER) += dma_buf_test_exporter/
 obj-$(CONFIG_MALI_MEMORY_GROUP_MANAGER) += memory_group_manager/
 obj-$(CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR) += protected_memory_allocator/
--- a/drivers/base/arm/Kconfig
+++ b/drivers/base/arm/Kconfig
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@@ -26,16 +26,6 @@ menuconfig MALI_BASE_MODULES
 	  Those modules provide extra features or debug interfaces and,
 	  are optional for the use of the Mali GPU modules.

-config DMA_BUF_LOCK
-	bool "Build dma-buf lock module"
-	depends on MALI_BASE_MODULES && MALI_DMA_FENCE
-	default y
-	help
-	  This option will build the dma_buf_lock module.
-
-	  Modules:
-	    - dma_buf_lock.ko
-
 config DMA_SHARED_BUFFER_TEST_EXPORTER
 	bool "Build dma-buf framework test exporter module"
 	depends on MALI_BASE_MODULES && DMA_SHARED_BUFFER
--- a/drivers/base/arm/Makefile
+++ b/drivers/base/arm/Makefile
@@ -38,11 +38,9 @@ ifeq ($(CONFIG_MALI_BASE_MODULES),y)
    CONFIG_MALI_CSF_SUPPORT ?= n

    ifneq ($(CONFIG_DMA_SHARED_BUFFER),n)
-        CONFIG_DMA_BUF_LOCK ?= y
        CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER ?= y
    else
        # Prevent misuse when CONFIG_DMA_SHARED_BUFFER=n
-        CONFIG_DMA_BUF_LOCK = n
        CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER = n
    endif

@@ -54,7 +52,6 @@ ifeq ($(CONFIG_MALI_BASE_MODULES),y)

 else
    # Prevent misuse when CONFIG_MALI_BASE_MODULES=n
-    CONFIG_DMA_BUF_LOCK = n
    CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER = n
    CONFIG_MALI_MEMORY_GROUP_MANAGER = n
    CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR = n
@@ -64,10 +61,9 @@ endif
 CONFIGS := \
    CONFIG_MALI_BASE_MODULES \
    CONFIG_MALI_CSF_SUPPORT \
-    CONFIG_DMA_BUF_LOCK \
    CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER \
    CONFIG_MALI_MEMORY_GROUP_MANAGER \
-    CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR
+    CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR \


 #
@@ -92,26 +88,47 @@ EXTRA_CFLAGS := $(foreach config,$(CONFIGS), \
                    $(if $(filter y m,$(value $(value config))), \
                        -D$(value config)=1))

-# The following were added to align with W=1 in scripts/Makefile.extrawarn
-# from the Linux source tree
 KBUILD_CFLAGS += -Wall -Werror
+
+# The following were added to align with W=1 in scripts/Makefile.extrawarn
+# from the Linux source tree (v5.18.14)
 KBUILD_CFLAGS += -Wextra -Wunused -Wno-unused-parameter
 KBUILD_CFLAGS += -Wmissing-declarations
 KBUILD_CFLAGS += -Wmissing-format-attribute
 KBUILD_CFLAGS += -Wmissing-prototypes
 KBUILD_CFLAGS += -Wold-style-definition
-KBUILD_CFLAGS += -Wmissing-include-dirs
+# The -Wmissing-include-dirs cannot be enabled as the path to some of the
+# included directories change depending on whether it is an in-tree or
+# out-of-tree build.
 KBUILD_CFLAGS += $(call cc-option, -Wunused-but-set-variable)
 KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable)
 KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned)
 KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation)
 # The following turn off the warnings enabled by -Wextra
-KBUILD_CFLAGS += -Wno-missing-field-initializers
 KBUILD_CFLAGS += -Wno-sign-compare
-KBUILD_CFLAGS += -Wno-type-limits
+KBUILD_CFLAGS += -Wno-shift-negative-value
+# This flag is needed to avoid build errors on older kernels
+KBUILD_CFLAGS += $(call cc-option, -Wno-cast-function-type)

 KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN1

+# The following were added to align with W=2 in scripts/Makefile.extrawarn
+# from the Linux source tree (v5.18.14)
+KBUILD_CFLAGS += -Wdisabled-optimization
+# The -Wshadow flag cannot be enabled unless upstream kernels are
+# patched to fix redefinitions of certain built-in functions and
+# global variables.
+KBUILD_CFLAGS += $(call cc-option, -Wlogical-op)
+KBUILD_CFLAGS += -Wmissing-field-initializers
+KBUILD_CFLAGS += -Wtype-limits
+KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized)
+KBUILD_CFLAGS += $(call cc-option, -Wunused-macros)
+
+KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2
+
+# This warning is disabled to avoid build failures in some kernel versions
+KBUILD_CFLAGS += -Wno-ignored-qualifiers
+
 all:
 	$(MAKE) -C $(KDIR) M=$(CURDIR) $(MAKE_ARGS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" KBUILD_EXTRA_SYMBOLS="$(EXTRA_SYMBOLS)" modules

--- a/drivers/base/arm/Mconfig
+++ b/drivers/base/arm/Mconfig
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@@ -26,16 +26,6 @@ menuconfig MALI_BASE_MODULES
 	  Those modules provide extra features or debug interfaces and,
 	  are optional for the use of the Mali GPU modules.

-config DMA_BUF_LOCK
-	bool "Build dma-buf lock module"
-	depends on MALI_BASE_MODULES && MALI_DMA_FENCE
-	default y
-	help
-	  This option will build the dma_buf_lock module.
-
-	  Modules:
-	    - dma_buf_lock.ko
-
 config DMA_SHARED_BUFFER_TEST_EXPORTER
 	bool "Build dma-buf framework test exporter module"
 	depends on MALI_BASE_MODULES
@@ -45,7 +35,7 @@ config DMA_SHARED_BUFFER_TEST_EXPORTER
 	  Usable to help test importers.

 	  Modules:
-	    - dma-buf-test-exporter.ko
+	  - dma-buf-test-exporter.ko

 config MALI_MEMORY_GROUP_MANAGER
 	bool "Build Mali Memory Group Manager module"
@@ -57,7 +47,7 @@ config MALI_MEMORY_GROUP_MANAGER
 	  for memory pools managed by Mali GPU device drivers.

 	  Modules:
-	    - memory_group_manager.ko
+	  - memory_group_manager.ko

 config MALI_PROTECTED_MEMORY_ALLOCATOR
 	bool "Build Mali Protected Memory Allocator module"
@@ -70,5 +60,5 @@ config MALI_PROTECTED_MEMORY_ALLOCATOR
 	  of Mali GPU device drivers.

 	  Modules:
-	    - protected_memory_allocator.ko
+	  - protected_memory_allocator.ko

--- a/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.c
+++ b/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.c
@@ -1,908 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
-/*
- *
- * (C) COPYRIGHT 2012-2014, 2017-2018, 2020-2022 ARM Limited. All rights reserved.
- *
- * This program is free software and is provided to you under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation, and any use by you of this program is subject to the terms
- * of such GNU license.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- */
-
-#include <linux/version.h>
-#include <linux/version_compat_defs.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/atomic.h>
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-#include <linux/reservation.h>
-#else
-#include <linux/dma-resv.h>
-#endif
-#include <linux/dma-buf.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/poll.h>
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-
-#include <linux/fence.h>
-
-#define dma_fence_context_alloc(a) fence_context_alloc(a)
-#define dma_fence_init(a, b, c, d, e) fence_init(a, b, c, d, e)
-#define dma_fence_get(a) fence_get(a)
-#define dma_fence_put(a) fence_put(a)
-#define dma_fence_signal(a) fence_signal(a)
-#define dma_fence_is_signaled(a) fence_is_signaled(a)
-#define dma_fence_add_callback(a, b, c) fence_add_callback(a, b, c)
-#define dma_fence_remove_callback(a, b) fence_remove_callback(a, b)
-
-#if (KERNEL_VERSION(4, 9, 68) > LINUX_VERSION_CODE)
-#define dma_fence_get_status(a) (fence_is_signaled(a) ? (a)->status ?: 1 : 0)
-#else
-#define dma_fence_get_status(a) (fence_is_signaled(a) ? (a)->error ?: 1 : 0)
-#endif
-
-#else
-
-#include <linux/dma-fence.h>
-
-#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
-#define dma_fence_get_status(a) (dma_fence_is_signaled(a) ? \
-	(a)->status ?: 1 \
-	: 0)
-#endif
-
-#endif /* < 4.10.0 */
-
-#include "dma_buf_lock.h"
-
-/* Maximum number of buffers that a single handle can address */
-#define DMA_BUF_LOCK_BUF_MAX 32
-
-#define DMA_BUF_LOCK_DEBUG 1
-
-#define DMA_BUF_LOCK_INIT_BIAS  0xFF
-
-static dev_t dma_buf_lock_dev;
-static struct cdev dma_buf_lock_cdev;
-static struct class *dma_buf_lock_class;
-static const char dma_buf_lock_dev_name[] = "dma_buf_lock";
-
-#if defined(HAVE_UNLOCKED_IOCTL) || defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE))
-static long dma_buf_lock_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-#else
-static int dma_buf_lock_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
-#endif
-
-static const struct file_operations dma_buf_lock_fops = {
-	.owner   = THIS_MODULE,
-#if defined(HAVE_UNLOCKED_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE))
-	.unlocked_ioctl   = dma_buf_lock_ioctl,
-#endif
-#if defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE))
-	.compat_ioctl   = dma_buf_lock_ioctl,
-#endif
-};
-
-struct dma_buf_lock_resource {
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-	struct fence fence;
-#else
-	struct dma_fence fence;
-#endif
-	int *list_of_dma_buf_fds;               /* List of buffers copied from userspace */
-	atomic_t locked;                        /* Status of lock */
-	struct dma_buf **dma_bufs;
-	unsigned long exclusive;                /* Exclusive access bitmap */
-	atomic_t fence_dep_count;		/* Number of dma-fence dependencies */
-	struct list_head dma_fence_callbacks;	/* list of all callbacks set up to wait on other fences */
-	wait_queue_head_t wait;
-	struct kref refcount;
-	struct list_head link;
-	struct work_struct work;
-	int count;
-};
-
-/**
- * struct dma_buf_lock_fence_cb - Callback data struct for dma-fence
- * @fence_cb: Callback function
- * @fence:    Pointer to the fence object on which this callback is waiting
- * @res:      Pointer to dma_buf_lock_resource that is waiting on this callback
- * @node:     List head for linking this callback to the lock resource
- */
-struct dma_buf_lock_fence_cb {
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-	struct fence_cb fence_cb;
-	struct fence *fence;
-#else
-	struct dma_fence_cb fence_cb;
-	struct dma_fence *fence;
-#endif
-	struct dma_buf_lock_resource *res;
-	struct list_head node;
-};
-
-static LIST_HEAD(dma_buf_lock_resource_list);
-static DEFINE_MUTEX(dma_buf_lock_mutex);
-
-static inline int is_dma_buf_lock_file(struct file *);
-static void dma_buf_lock_dounlock(struct kref *ref);
-
-
-/*** dma_buf_lock fence part ***/
-
-/* Spin lock protecting all Mali fences as fence->lock. */
-static DEFINE_SPINLOCK(dma_buf_lock_fence_lock);
-
-static const char *
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-dma_buf_lock_fence_get_driver_name(struct fence *fence)
-#else
-dma_buf_lock_fence_get_driver_name(struct dma_fence *fence)
-#endif
-{
-	return "dma_buf_lock";
-}
-
-static const char *
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-dma_buf_lock_fence_get_timeline_name(struct fence *fence)
-#else
-dma_buf_lock_fence_get_timeline_name(struct dma_fence *fence)
-#endif
-{
-	return "dma_buf_lock.timeline";
-}
-
-static bool
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-dma_buf_lock_fence_enable_signaling(struct fence *fence)
-#else
-dma_buf_lock_fence_enable_signaling(struct dma_fence *fence)
-#endif
-{
-	return true;
-}
-
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-const struct fence_ops dma_buf_lock_fence_ops = {
-	.wait = fence_default_wait,
-#else
-const struct dma_fence_ops dma_buf_lock_fence_ops = {
-	.wait = dma_fence_default_wait,
-#endif
-	.get_driver_name = dma_buf_lock_fence_get_driver_name,
-	.get_timeline_name = dma_buf_lock_fence_get_timeline_name,
-	.enable_signaling = dma_buf_lock_fence_enable_signaling,
-};
-
-static void
-dma_buf_lock_fence_init(struct dma_buf_lock_resource *resource)
-{
-	dma_fence_init(&resource->fence,
-		       &dma_buf_lock_fence_ops,
-		       &dma_buf_lock_fence_lock,
-		       0,
-		       0);
-}
-
-static void
-dma_buf_lock_fence_free_callbacks(struct dma_buf_lock_resource *resource)
-{
-	struct dma_buf_lock_fence_cb *cb, *tmp;
-
-	/* Clean up and free callbacks. */
-	list_for_each_entry_safe(cb, tmp, &resource->dma_fence_callbacks, node) {
-		/* Cancel callbacks that hasn't been called yet and release the
-		 * reference taken in dma_buf_lock_fence_add_callback().
-		 */
-		dma_fence_remove_callback(cb->fence, &cb->fence_cb);
-		dma_fence_put(cb->fence);
-		list_del(&cb->node);
-		kfree(cb);
-	}
-}
-
-static void
-dma_buf_lock_fence_work(struct work_struct *pwork)
-{
-	struct dma_buf_lock_resource *resource =
-		container_of(pwork, struct dma_buf_lock_resource, work);
-
-	WARN_ON(atomic_read(&resource->fence_dep_count));
-	WARN_ON(!atomic_read(&resource->locked));
-	WARN_ON(!resource->exclusive);
-
-	mutex_lock(&dma_buf_lock_mutex);
-	kref_put(&resource->refcount, dma_buf_lock_dounlock);
-	mutex_unlock(&dma_buf_lock_mutex);
-}
-
-static void
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-dma_buf_lock_fence_callback(struct fence *fence, struct fence_cb *cb)
-#else
-dma_buf_lock_fence_callback(struct dma_fence *fence, struct dma_fence_cb *cb)
-#endif
-{
-	struct dma_buf_lock_fence_cb *dma_buf_lock_cb = container_of(cb,
-				struct dma_buf_lock_fence_cb,
-				fence_cb);
-	struct dma_buf_lock_resource *resource = dma_buf_lock_cb->res;
-
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s\n", __func__);
-#endif
-
-	/* Callback function will be invoked in atomic context. */
-
-	if (atomic_dec_and_test(&resource->fence_dep_count)) {
-		atomic_set(&resource->locked, 1);
-		wake_up(&resource->wait);
-
-		if (resource->exclusive)
-			/* Warn if the work was already queued */
-			WARN_ON(!schedule_work(&resource->work));
-	}
-}
-
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-static int
-dma_buf_lock_fence_add_callback(struct dma_buf_lock_resource *resource,
-				struct fence *fence,
-				fence_func_t callback)
-#else
-static int
-dma_buf_lock_fence_add_callback(struct dma_buf_lock_resource *resource,
-				struct dma_fence *fence,
-				dma_fence_func_t callback)
-#endif
-{
-	int err = 0;
-	struct dma_buf_lock_fence_cb *fence_cb;
-
-	if (!fence)
-		return -EINVAL;
-
-	fence_cb = kmalloc(sizeof(*fence_cb), GFP_KERNEL);
-	if (!fence_cb)
-		return -ENOMEM;
-
-	fence_cb->fence = fence;
-	fence_cb->res   = resource;
-	INIT_LIST_HEAD(&fence_cb->node);
-
-	err = dma_fence_add_callback(fence, &fence_cb->fence_cb,
-				     callback);
-
-	if (err == -ENOENT) {
-		/* Fence signaled, get the completion result */
-		err = dma_fence_get_status(fence);
-
-		/* remap success completion to err code */
-		if (err == 1)
-			err = 0;
-
-		kfree(fence_cb);
-	} else if (err) {
-		kfree(fence_cb);
-	} else {
-		/*
-		 * Get reference to fence that will be kept until callback gets
-		 * cleaned up in dma_buf_lock_fence_free_callbacks().
-		 */
-		dma_fence_get(fence);
-		atomic_inc(&resource->fence_dep_count);
-		/* Add callback to resource's list of callbacks */
-		list_add(&fence_cb->node, &resource->dma_fence_callbacks);
-	}
-
-	return err;
-}
-
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-static int
-dma_buf_lock_add_fence_reservation_callback(struct dma_buf_lock_resource *resource,
-					    struct reservation_object *resv,
-					    bool exclusive)
-#else
-static int
-dma_buf_lock_add_fence_reservation_callback(struct dma_buf_lock_resource *resource,
-					    struct dma_resv *resv,
-					    bool exclusive)
-#endif
-{
-#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
-	struct fence *excl_fence = NULL;
-	struct fence **shared_fences = NULL;
-#else
-	struct dma_fence *excl_fence = NULL;
-	struct dma_fence **shared_fences = NULL;
-#endif
-	unsigned int shared_count = 0;
-	int err, i;
-
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-	err = reservation_object_get_fences_rcu(
-#elif (KERNEL_VERSION(5, 14, 0) > LINUX_VERSION_CODE)
-	err = dma_resv_get_fences_rcu(
-#else
-	err = dma_resv_get_fences(
-#endif
-						resv,
-						&excl_fence,
-						&shared_count,
-						&shared_fences);
-	if (err)
-		return err;
-
-	if (excl_fence) {
-		err = dma_buf_lock_fence_add_callback(resource,
-						      excl_fence,
-						      dma_buf_lock_fence_callback);
-
-		/* Release our reference, taken by reservation_object_get_fences_rcu(),
-		 * to the fence. We have set up our callback (if that was possible),
-		 * and it's the fence's owner is responsible for singling the fence
-		 * before allowing it to disappear.
-		 */
-		dma_fence_put(excl_fence);
-
-		if (err)
-			goto out;
-	}
-
-	if (exclusive) {
-		for (i = 0; i < shared_count; i++) {
-			err = dma_buf_lock_fence_add_callback(resource,
-							      shared_fences[i],
-							      dma_buf_lock_fence_callback);
-			if (err)
-				goto out;
-		}
-	}
-
-	/* Release all our references to the shared fences, taken by
-	 * reservation_object_get_fences_rcu(). We have set up our callback (if
-	 * that was possible), and it's the fence's owner is responsible for
-	 * signaling the fence before allowing it to disappear.
-	 */
-out:
-	for (i = 0; i < shared_count; i++)
-		dma_fence_put(shared_fences[i]);
-	kfree(shared_fences);
-
-	return err;
-}
-
-static void
-dma_buf_lock_release_fence_reservation(struct dma_buf_lock_resource *resource,
-				       struct ww_acquire_ctx *ctx)
-{
-	unsigned int r;
-
-	for (r = 0; r < resource->count; r++)
-		ww_mutex_unlock(&resource->dma_bufs[r]->resv->lock);
-	ww_acquire_fini(ctx);
-}
-
-static int
-dma_buf_lock_acquire_fence_reservation(struct dma_buf_lock_resource *resource,
-				       struct ww_acquire_ctx *ctx)
-{
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-	struct reservation_object *content_resv = NULL;
-#else
-	struct dma_resv *content_resv = NULL;
-#endif
-	unsigned int content_resv_idx = 0;
-	unsigned int r;
-	int err = 0;
-
-	ww_acquire_init(ctx, &reservation_ww_class);
-
-retry:
-	for (r = 0; r < resource->count; r++) {
-		if (resource->dma_bufs[r]->resv == content_resv) {
-			content_resv = NULL;
-			continue;
-		}
-
-		err = ww_mutex_lock(&resource->dma_bufs[r]->resv->lock, ctx);
-		if (err)
-			goto error;
-	}
-
-	ww_acquire_done(ctx);
-	return err;
-
-error:
-	content_resv_idx = r;
-
-	/* Unlock the locked one ones */
-	while (r--)
-		ww_mutex_unlock(&resource->dma_bufs[r]->resv->lock);
-
-	if (content_resv)
-		ww_mutex_unlock(&content_resv->lock);
-
-	/* If we deadlock try with lock_slow and retry */
-	if (err == -EDEADLK) {
-#if DMA_BUF_LOCK_DEBUG
-		pr_debug("deadlock at dma_buf fd %i\n",
-		       resource->list_of_dma_buf_fds[content_resv_idx]);
-#endif
-		content_resv = resource->dma_bufs[content_resv_idx]->resv;
-		ww_mutex_lock_slow(&content_resv->lock, ctx);
-		goto retry;
-	}
-
-	/* If we are here the function failed */
-	ww_acquire_fini(ctx);
-	return err;
-}
-
-static int dma_buf_lock_handle_release(struct inode *inode, struct file *file)
-{
-	struct dma_buf_lock_resource *resource;
-
-	if (!is_dma_buf_lock_file(file))
-		return -EINVAL;
-
-	resource = file->private_data;
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s\n", __func__);
-#endif
-	mutex_lock(&dma_buf_lock_mutex);
-	kref_put(&resource->refcount, dma_buf_lock_dounlock);
-	mutex_unlock(&dma_buf_lock_mutex);
-
-	return 0;
-}
-
-static __poll_t dma_buf_lock_handle_poll(struct file *file, poll_table *wait)
-{
-	struct dma_buf_lock_resource *resource;
-	unsigned int ret = 0;
-
-	if (!is_dma_buf_lock_file(file)) {
-#if (KERNEL_VERSION(4, 19, 0) > LINUX_VERSION_CODE)
-		return POLLERR;
-#else
-		return EPOLLERR;
-#endif
-	}
-
-	resource = file->private_data;
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s\n", __func__);
-#endif
-	if (atomic_read(&resource->locked) == 1) {
-		/* Resources have been locked */
-#if (KERNEL_VERSION(4, 19, 0) > LINUX_VERSION_CODE)
-		ret = POLLIN | POLLRDNORM;
-		if (resource->exclusive)
-			ret |= POLLOUT | POLLWRNORM;
-#else
-		ret = EPOLLIN | EPOLLRDNORM;
-		if (resource->exclusive)
-			ret |= EPOLLOUT | EPOLLWRNORM;
-#endif
-	} else {
-		if (!poll_does_not_wait(wait))
-			poll_wait(file, &resource->wait, wait);
-	}
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s : return %i\n", __func__, ret);
-#endif
-	return ret;
-}
-
-static const struct file_operations dma_buf_lock_handle_fops = {
-	.owner		= THIS_MODULE,
-	.release	= dma_buf_lock_handle_release,
-	.poll		= dma_buf_lock_handle_poll,
-};
-
-/*
- * is_dma_buf_lock_file - Check if struct file* is associated with dma_buf_lock
- */
-static inline int is_dma_buf_lock_file(struct file *file)
-{
-	return file->f_op == &dma_buf_lock_handle_fops;
-}
-
-/*
- * Start requested lock.
- *
- * Allocates required memory, copies dma_buf_fd list from userspace,
- * acquires related reservation objects, and starts the lock.
- */
-static int dma_buf_lock_dolock(struct dma_buf_lock_k_request *request)
-{
-	struct dma_buf_lock_resource *resource;
-	struct ww_acquire_ctx ww_ctx;
-	struct file *file;
-	int size;
-	int fd;
-	int i;
-	int ret;
-	int error;
-
-	if (request->list_of_dma_buf_fds == NULL)
-		return -EINVAL;
-	if (request->count <= 0)
-		return -EINVAL;
-	if (request->count > DMA_BUF_LOCK_BUF_MAX)
-		return -EINVAL;
-	if (request->exclusive != DMA_BUF_LOCK_NONEXCLUSIVE &&
-	    request->exclusive != DMA_BUF_LOCK_EXCLUSIVE)
-		return -EINVAL;
-
-	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
-	if (resource == NULL)
-		return -ENOMEM;
-
-	atomic_set(&resource->locked, 0);
-	kref_init(&resource->refcount);
-	INIT_LIST_HEAD(&resource->link);
-	INIT_WORK(&resource->work, dma_buf_lock_fence_work);
-	resource->count = request->count;
-
-	/* Allocate space to store dma_buf_fds received from user space */
-	size = request->count * sizeof(int);
-	resource->list_of_dma_buf_fds = kmalloc(size, GFP_KERNEL);
-
-	if (resource->list_of_dma_buf_fds == NULL) {
-		kfree(resource);
-		return -ENOMEM;
-	}
-
-	/* Allocate space to store dma_buf pointers associated with dma_buf_fds */
-	size = sizeof(struct dma_buf *) * request->count;
-	resource->dma_bufs = kmalloc(size, GFP_KERNEL);
-
-	if (resource->dma_bufs == NULL) {
-		kfree(resource->list_of_dma_buf_fds);
-		kfree(resource);
-		return -ENOMEM;
-	}
-
-	/* Copy requested list of dma_buf_fds from user space */
-	size = request->count * sizeof(int);
-	if (copy_from_user(resource->list_of_dma_buf_fds,
-			   (void __user *)request->list_of_dma_buf_fds,
-			   size) != 0) {
-		kfree(resource->list_of_dma_buf_fds);
-		kfree(resource->dma_bufs);
-		kfree(resource);
-		return -ENOMEM;
-	}
-#if DMA_BUF_LOCK_DEBUG
-	for (i = 0; i < request->count; i++)
-		pr_debug("dma_buf %i = %X\n", i, resource->list_of_dma_buf_fds[i]);
-#endif
-
-	/* Initialize the fence associated with dma_buf_lock resource */
-	dma_buf_lock_fence_init(resource);
-
-	INIT_LIST_HEAD(&resource->dma_fence_callbacks);
-
-	atomic_set(&resource->fence_dep_count, DMA_BUF_LOCK_INIT_BIAS);
-
-	/* Add resource to global list */
-	mutex_lock(&dma_buf_lock_mutex);
-
-	list_add(&resource->link, &dma_buf_lock_resource_list);
-
-	mutex_unlock(&dma_buf_lock_mutex);
-
-	for (i = 0; i < request->count; i++) {
-		/* Convert fd into dma_buf structure */
-		resource->dma_bufs[i] = dma_buf_get(resource->list_of_dma_buf_fds[i]);
-
-		if (IS_ERR_VALUE(PTR_ERR(resource->dma_bufs[i]))) {
-			mutex_lock(&dma_buf_lock_mutex);
-			kref_put(&resource->refcount, dma_buf_lock_dounlock);
-			mutex_unlock(&dma_buf_lock_mutex);
-			return -EINVAL;
-		}
-
-		/*Check the reservation object associated with dma_buf */
-		if (resource->dma_bufs[i]->resv == NULL) {
-			mutex_lock(&dma_buf_lock_mutex);
-			kref_put(&resource->refcount, dma_buf_lock_dounlock);
-			mutex_unlock(&dma_buf_lock_mutex);
-			return -EINVAL;
-		}
-#if DMA_BUF_LOCK_DEBUG
-		pr_debug("%s : dma_buf_fd %i dma_buf %p dma_fence reservation %p\n",
-		       __func__, resource->list_of_dma_buf_fds[i], resource->dma_bufs[i], resource->dma_bufs[i]->resv);
-#endif
-	}
-
-	init_waitqueue_head(&resource->wait);
-
-	kref_get(&resource->refcount);
-
-	error = get_unused_fd_flags(0);
-	if (error < 0)
-		return error;
-
-	fd = error;
-
-	file = anon_inode_getfile("dma_buf_lock", &dma_buf_lock_handle_fops, (void *)resource, 0);
-
-	if (IS_ERR(file)) {
-		put_unused_fd(fd);
-		mutex_lock(&dma_buf_lock_mutex);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		mutex_unlock(&dma_buf_lock_mutex);
-		return PTR_ERR(file);
-	}
-
-	resource->exclusive = request->exclusive;
-
-	/* Start locking process */
-	ret = dma_buf_lock_acquire_fence_reservation(resource, &ww_ctx);
-	if (ret) {
-#if DMA_BUF_LOCK_DEBUG
-		pr_debug("%s : Error %d locking reservations.\n", __func__, ret);
-#endif
-		put_unused_fd(fd);
-		mutex_lock(&dma_buf_lock_mutex);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		mutex_unlock(&dma_buf_lock_mutex);
-		return ret;
-	}
-
-	/* Take an extra reference for exclusive access, which will be dropped
-	 * once the pre-existing fences attached to dma-buf resources, for which
-	 * we have commited for exclusive access, are signaled.
-	 * At a given time there can be only one exclusive fence attached to a
-	 * reservation object, so the new exclusive fence replaces the original
-	 * fence and the future sync is done against the new fence which is
-	 * supposed to be signaled only after the original fence was signaled.
-	 * If the new exclusive fence is signaled prematurely then the resources
-	 * would become available for new access while they are already being
-	 * written to by the original owner.
-	 */
-	if (resource->exclusive)
-		kref_get(&resource->refcount);
-
-	for (i = 0; i < request->count; i++) {
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-		struct reservation_object *resv = resource->dma_bufs[i]->resv;
-#else
-		struct dma_resv *resv = resource->dma_bufs[i]->resv;
-#endif
-		if (!test_bit(i, &resource->exclusive)) {
-
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-			ret = reservation_object_reserve_shared(resv);
-#else
-			ret = dma_resv_reserve_shared(resv, 0);
-#endif
-			if (ret) {
-#if DMA_BUF_LOCK_DEBUG
-				pr_debug("%s : Error %d reserving space for shared fence.\n", __func__, ret);
-#endif
-				break;
-			}
-
-			ret = dma_buf_lock_add_fence_reservation_callback(resource,
-									  resv,
-									  false);
-			if (ret) {
-#if DMA_BUF_LOCK_DEBUG
-				pr_debug("%s : Error %d adding reservation to callback.\n", __func__, ret);
-#endif
-				break;
-			}
-
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-			reservation_object_add_shared_fence(resv, &resource->fence);
-#else
-			dma_resv_add_shared_fence(resv, &resource->fence);
-#endif
-		} else {
-			ret = dma_buf_lock_add_fence_reservation_callback(resource, resv, true);
-			if (ret) {
-#if DMA_BUF_LOCK_DEBUG
-				pr_debug("%s : Error %d adding reservation to callback.\n", __func__, ret);
-#endif
-				break;
-			}
-
-#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-			reservation_object_add_excl_fence(resv, &resource->fence);
-#else
-			dma_resv_add_excl_fence(resv, &resource->fence);
-#endif
-		}
-	}
-
-	dma_buf_lock_release_fence_reservation(resource, &ww_ctx);
-
-	/* Test if the callbacks were already triggered */
-	if (!atomic_sub_return(DMA_BUF_LOCK_INIT_BIAS, &resource->fence_dep_count)) {
-		atomic_set(&resource->locked, 1);
-
-		/* Drop the extra reference taken for exclusive access */
-		if (resource->exclusive)
-			dma_buf_lock_fence_work(&resource->work);
-	}
-
-	if (IS_ERR_VALUE((unsigned long)ret)) {
-		put_unused_fd(fd);
-
-		mutex_lock(&dma_buf_lock_mutex);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		mutex_unlock(&dma_buf_lock_mutex);
-
-		return ret;
-	}
-
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s : complete\n", __func__);
-#endif
-	mutex_lock(&dma_buf_lock_mutex);
-	kref_put(&resource->refcount, dma_buf_lock_dounlock);
-	mutex_unlock(&dma_buf_lock_mutex);
-
-	/* Installing the fd is deferred to the very last operation before return
-	 * to avoid allowing userspace to close it during the setup.
-	 */
-	fd_install(fd, file);
-	return fd;
-}
-
-static void dma_buf_lock_dounlock(struct kref *ref)
-{
-	int i;
-	struct dma_buf_lock_resource *resource = container_of(ref, struct dma_buf_lock_resource, refcount);
-
-	atomic_set(&resource->locked, 0);
-
-	/* Signal the resource's fence. */
-	dma_fence_signal(&resource->fence);
-
-	dma_buf_lock_fence_free_callbacks(resource);
-
-	list_del(&resource->link);
-
-	for (i = 0; i < resource->count; i++) {
-		if (resource->dma_bufs[i])
-			dma_buf_put(resource->dma_bufs[i]);
-	}
-
-	kfree(resource->dma_bufs);
-	kfree(resource->list_of_dma_buf_fds);
-	dma_fence_put(&resource->fence);
-}
-
-static int __init dma_buf_lock_init(void)
-{
-	int err;
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s\n", __func__);
-#endif
-	err = alloc_chrdev_region(&dma_buf_lock_dev, 0, 1, dma_buf_lock_dev_name);
-
-	if (err == 0) {
-		cdev_init(&dma_buf_lock_cdev, &dma_buf_lock_fops);
-
-		err = cdev_add(&dma_buf_lock_cdev, dma_buf_lock_dev, 1);
-
-		if (err == 0) {
-			dma_buf_lock_class = class_create(THIS_MODULE, dma_buf_lock_dev_name);
-			if (IS_ERR(dma_buf_lock_class))
-				err = PTR_ERR(dma_buf_lock_class);
-			else {
-				struct device *mdev = device_create(
-					dma_buf_lock_class, NULL, dma_buf_lock_dev,
-					NULL, "%s", dma_buf_lock_dev_name);
-				if (!IS_ERR(mdev))
-					return 0;
-
-				err = PTR_ERR(mdev);
-				class_destroy(dma_buf_lock_class);
-			}
-			cdev_del(&dma_buf_lock_cdev);
-		}
-
-		unregister_chrdev_region(dma_buf_lock_dev, 1);
-	}
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s failed\n", __func__);
-#endif
-	return err;
-}
-
-static void __exit dma_buf_lock_exit(void)
-{
-#if DMA_BUF_LOCK_DEBUG
-	pr_debug("%s\n", __func__);
-#endif
-
-	/* Unlock all outstanding references */
-	while (1) {
-		struct dma_buf_lock_resource *resource;
-
-		mutex_lock(&dma_buf_lock_mutex);
-		if (list_empty(&dma_buf_lock_resource_list)) {
-			mutex_unlock(&dma_buf_lock_mutex);
-			break;
-		}
-
-		resource = list_entry(dma_buf_lock_resource_list.next,
-			struct dma_buf_lock_resource, link);
-
-		kref_put(&resource->refcount, dma_buf_lock_dounlock);
-		mutex_unlock(&dma_buf_lock_mutex);
-	}
-
-	device_destroy(dma_buf_lock_class, dma_buf_lock_dev);
-
-	class_destroy(dma_buf_lock_class);
-
-	cdev_del(&dma_buf_lock_cdev);
-
-	unregister_chrdev_region(dma_buf_lock_dev, 1);
-}
-
-#if defined(HAVE_UNLOCKED_IOCTL) || defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE))
-static long dma_buf_lock_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-#else
-static int dma_buf_lock_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
-#endif
-{
-	struct dma_buf_lock_k_request request;
-	int size = _IOC_SIZE(cmd);
-
-	if (_IOC_TYPE(cmd) != DMA_BUF_LOCK_IOC_MAGIC)
-		return -ENOTTY;
-	if ((_IOC_NR(cmd) < DMA_BUF_LOCK_IOC_MINNR) || (_IOC_NR(cmd) > DMA_BUF_LOCK_IOC_MAXNR))
-		return -ENOTTY;
-
-	switch (cmd) {
-	case DMA_BUF_LOCK_FUNC_LOCK_ASYNC:
-		if (size != sizeof(request))
-			return -ENOTTY;
-		if (copy_from_user(&request, (void __user *)arg, size))
-			return -EFAULT;
-#if DMA_BUF_LOCK_DEBUG
-		pr_debug("DMA_BUF_LOCK_FUNC_LOCK_ASYNC - %i\n", request.count);
-#endif
-		return dma_buf_lock_dolock(&request);
-	}
-
-	return -ENOTTY;
-}
-
-module_init(dma_buf_lock_init);
-module_exit(dma_buf_lock_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_INFO(import_ns, "DMA_BUF");
--- a/drivers/base/arm/dma_buf_test_exporter/build.bp
+++ b/drivers/base/arm/dma_buf_test_exporter/build.bp
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2017, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2017, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -22,7 +22,7 @@
 bob_kernel_module {
    name: "dma-buf-test-exporter",
    defaults: [
-        "kernel_defaults"
+        "kernel_defaults",
    ],
    srcs: [
        "Kbuild",
--- a/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c
+++ b/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c
@ -19,7 +19,7 @@
 *
 */

-#include <linux/dma-buf-test-exporter.h>
+#include <uapi/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.h>
 #include <linux/dma-buf.h>
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
@ -32,6 +32,9 @@
 #include <linux/highmem.h>
 #include <linux/dma-mapping.h>

+#define DMA_BUF_TE_VER_MAJOR 1
+#define DMA_BUF_TE_VER_MINOR 0
+
 /* Maximum size allowed in a single DMA_BUF_TE_ALLOC call */
 #define DMA_BUF_TE_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */

--- a/drivers/base/arm/memory_group_manager/build.bp
+++ b/drivers/base/arm/memory_group_manager/build.bp
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -22,7 +22,7 @@
 bob_kernel_module {
    name: "memory_group_manager",
    defaults: [
-        "kernel_defaults"
+        "kernel_defaults",
    ],
    srcs: [
        "Kbuild",
--- a/drivers/base/arm/memory_group_manager/memory_group_manager.c
+++ b/drivers/base/arm/memory_group_manager/memory_group_manager.c
@ -265,8 +265,8 @@ static struct page *example_mgm_alloc_page(
 	struct mgm_groups *const data = mgm_dev->data;
 	struct page *p;

-	dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d gfp_mask=0x%x order=%u\n",
-		__func__, (void *)mgm_dev, group_id, gfp_mask, order);
+	dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d gfp_mask=0x%x order=%u\n", __func__,
+		(void *)mgm_dev, group_id, gfp_mask, order);

 	if (WARN_ON(group_id < 0) ||
 		WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS))
@ -291,8 +291,8 @@ static void example_mgm_free_page(
 {
 	struct mgm_groups *const data = mgm_dev->data;

-	dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d page=%p order=%u\n",
-		__func__, (void *)mgm_dev, group_id, (void *)page, order);
+	dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d page=%pK order=%u\n", __func__,
+		(void *)mgm_dev, group_id, (void *)page, order);

 	if (WARN_ON(group_id < 0) ||
 		WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS))
@ -309,9 +309,8 @@ static int example_mgm_get_import_memory_id(
 {
 	struct mgm_groups *const data = mgm_dev->data;

-	dev_dbg(data->dev, "%s(mgm_dev=%p, import_data=%p (type=%d)\n",
-		__func__, (void *)mgm_dev, (void *)import_data,
-		(int)import_data->type);
+	dev_dbg(data->dev, "%s(mgm_dev=%pK, import_data=%pK (type=%d)\n", __func__, (void *)mgm_dev,
+		(void *)import_data, (int)import_data->type);

 	if (!WARN_ON(!import_data)) {
 		WARN_ON(!import_data->u.dma_buf);
@ -329,9 +328,8 @@ static u64 example_mgm_update_gpu_pte(
 {
 	struct mgm_groups *const data = mgm_dev->data;

-	dev_dbg(data->dev,
-		"%s(mgm_dev=%p, group_id=%d, mmu_level=%d, pte=0x%llx)\n",
-		__func__, (void *)mgm_dev, group_id, mmu_level, pte);
+	dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d, mmu_level=%d, pte=0x%llx)\n", __func__,
+		(void *)mgm_dev, group_id, mmu_level, pte);

 	if (WARN_ON(group_id < 0) ||
 		WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS))
@ -367,9 +365,9 @@ static vm_fault_t example_mgm_vmf_insert_pfn_prot(
 	vm_fault_t fault;

 	dev_dbg(data->dev,
-		"%s(mgm_dev=%p, group_id=%d, vma=%p, addr=0x%lx, pfn=0x%lx, prot=0x%llx)\n",
+		"%s(mgm_dev=%pK, group_id=%d, vma=%pK, addr=0x%lx, pfn=0x%lx, prot=0x%llx)\n",
 		__func__, (void *)mgm_dev, group_id, (void *)vma, addr, pfn,
-		(unsigned long long) pgprot_val(prot));
+		(unsigned long long)pgprot_val(prot));

 	if (WARN_ON(group_id < 0) ||
 		WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS))
--- a/drivers/base/arm/protected_memory_allocator/build.bp
+++ b/drivers/base/arm/protected_memory_allocator/build.bp
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -22,7 +22,7 @@
 bob_kernel_module {
    name: "protected_memory_allocator",
    defaults: [
-        "kernel_defaults"
+        "kernel_defaults",
    ],
    srcs: [
        "Kbuild",
--- a/drivers/gpu/arm/bifrost/Kbuild
+++ b/drivers/gpu/arm/bifrost/Kbuild
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2012-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@ -59,10 +59,8 @@ ifeq ($(CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS), y)
 endif

 ifeq ($(CONFIG_MALI_BIFROST_FENCE_DEBUG), y)
-    ifneq ($(CONFIG_SYNC), y)
-        ifneq ($(CONFIG_SYNC_FILE), y)
-            $(error CONFIG_MALI_BIFROST_FENCE_DEBUG depends on CONFIG_SYNC || CONFIG_SYNC_FILE to be set in Kernel configuration)
-        endif
+    ifneq ($(CONFIG_SYNC_FILE), y)
+        $(error CONFIG_MALI_BIFROST_FENCE_DEBUG depends on CONFIG_SYNC_FILE to be set in Kernel configuration)
    endif
 endif

@ -71,7 +69,7 @@ endif
 #

 # Driver version string which is returned to userspace via an ioctl
-MALI_RELEASE_NAME ?= '"g13p0-01eac0"'
+MALI_RELEASE_NAME ?= '"g15p0-01eac0"'
 # Set up defaults if not defined by build system
 ifeq ($(CONFIG_MALI_BIFROST_DEBUG), y)
    MALI_UNIT_TEST = 1
@ -151,6 +149,7 @@ bifrost_kbase-y := \
    mali_kbase_cache_policy.o \
    mali_kbase_ccswe.o \
    mali_kbase_mem.o \
+    mali_kbase_mem_migrate.o \
    mali_kbase_mem_pool_group.o \
    mali_kbase_native_mgm.o \
    mali_kbase_ctx_sched.o \
@ -159,12 +158,6 @@ bifrost_kbase-y := \
    mali_kbase_config.o \
    mali_kbase_kinstr_prfcnt.o \
    mali_kbase_vinstr.o \
-    mali_kbase_hwcnt.o \
-    mali_kbase_hwcnt_gpu.o \
-    mali_kbase_hwcnt_gpu_narrow.o \
-    mali_kbase_hwcnt_types.o \
-    mali_kbase_hwcnt_virtualizer.o \
-    mali_kbase_hwcnt_watchdog_if_timer.o \
    mali_kbase_softjobs.o \
    mali_kbase_hw.o \
    mali_kbase_debug.o \
@ -175,6 +168,7 @@ bifrost_kbase-y := \
    mali_kbase_disjoint_events.o \
    mali_kbase_debug_mem_view.o \
    mali_kbase_debug_mem_zones.o \
+    mali_kbase_debug_mem_allocs.o \
    mali_kbase_smc.o \
    mali_kbase_mem_pool.o \
    mali_kbase_mem_pool_debugfs.o \
@ -191,24 +185,14 @@ bifrost_kbase-$(CONFIG_DEBUG_FS) += mali_kbase_pbha_debugfs.o

 bifrost_kbase-$(CONFIG_MALI_CINSTR_GWT) += mali_kbase_gwt.o

-bifrost_kbase-$(CONFIG_SYNC) += \
-    mali_kbase_sync_android.o \
-    mali_kbase_sync_common.o
-
 bifrost_kbase-$(CONFIG_SYNC_FILE) += \
    mali_kbase_fence_ops.o \
    mali_kbase_sync_file.o \
    mali_kbase_sync_common.o

-ifeq ($(CONFIG_MALI_CSF_SUPPORT),y)
-    bifrost_kbase-y += \
-        mali_kbase_hwcnt_backend_csf.o \
-        mali_kbase_hwcnt_backend_csf_if_fw.o
-else
+ifneq ($(CONFIG_MALI_CSF_SUPPORT),y)
    bifrost_kbase-y += \
        mali_kbase_jm.o \
-        mali_kbase_hwcnt_backend_jm.o \
-        mali_kbase_hwcnt_backend_jm_watchdog.o \
        mali_kbase_dummy_job_wa.o \
        mali_kbase_debug_job_fault.o \
        mali_kbase_event.o \
@ -218,11 +202,6 @@ else
        mali_kbase_js_ctx_attr.o \
        mali_kbase_kinstr_jm.o

-    bifrost_kbase-$(CONFIG_MALI_BIFROST_DMA_FENCE) += \
-        mali_kbase_fence_ops.o \
-        mali_kbase_dma_fence.o \
-        mali_kbase_fence.o
-
    bifrost_kbase-$(CONFIG_SYNC_FILE) += \
        mali_kbase_fence_ops.o \
        mali_kbase_fence.o
@ -236,6 +215,7 @@ INCLUDE_SUBDIR = \
    $(src)/backend/gpu/Kbuild \
    $(src)/mmu/Kbuild \
    $(src)/tl/Kbuild \
+    $(src)/hwcnt/Kbuild \
    $(src)/gpu/Kbuild \
    $(src)/thirdparty/Kbuild \
    $(src)/platform/$(MALI_PLATFORM_DIR)/Kbuild
--- a/drivers/gpu/arm/bifrost/Kconfig
+++ b/drivers/gpu/arm/bifrost/Kconfig
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2012-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@ -91,16 +91,6 @@ config MALI_BIFROST_ENABLE_TRACE
 	  Enables tracing in kbase. Trace log available through
 	  the "mali_trace" debugfs file, when the CONFIG_DEBUG_FS is enabled

-config MALI_BIFROST_DMA_FENCE
-	bool "Enable DMA_BUF fence support for Mali"
-	depends on MALI_BIFROST
-	default n
-	help
-	  Support DMA_BUF fences for Mali.
-
-	  This option should only be enabled if the Linux Kernel has built in
-	  support for DMA_BUF fences.
-
 config MALI_ARBITER_SUPPORT
 	bool "Enable arbiter support for Mali"
 	depends on MALI_BIFROST && !MALI_CSF_SUPPORT
@ -117,7 +107,7 @@ config MALI_DMA_BUF_MAP_ON_DEMAND
 	depends on MALI_BIFROST
 	default n
 	help
-	  This option caused kbase to set up the GPU mapping of imported
+	  This option will cause kbase to set up the GPU mapping of imported
 	  dma-buf when needed to run atoms. This is the legacy behavior.

 	  This is intended for testing and the option will get removed in the
@ -237,7 +227,7 @@ config MALI_BIFROST_DEBUG

 config MALI_BIFROST_FENCE_DEBUG
 	bool "Enable debug sync fence usage"
-	depends on MALI_BIFROST && MALI_BIFROST_EXPERT && (SYNC || SYNC_FILE)
+	depends on MALI_BIFROST && MALI_BIFROST_EXPERT && SYNC_FILE
 	default y if MALI_BIFROST_DEBUG
 	help
 	  Select this option to enable additional checking and reporting on the
@ -385,9 +375,6 @@ config MALI_ARBITRATION
 	  virtualization setup for Mali
 	  If unsure, say N.

-if MALI_ARBITRATION
-source "drivers/gpu/arm/bifrost/arbitration/Kconfig"
-endif

 # source "drivers/gpu/arm/bifrost/tests/Kconfig"

--- a/drivers/gpu/arm/bifrost/Makefile
+++ b/drivers/gpu/arm/bifrost/Makefile
@ -65,7 +65,7 @@ ifeq ($(CONFIG_MALI_BIFROST),m)
    endif

    ifeq ($(CONFIG_XEN),y)
-        ifneq ($(CONFIG_MALI_ARBITRATION), n)
+        ifneq ($(CONFIG_MALI_ARBITER_SUPPORT), n)
            CONFIG_MALI_XEN ?= m
        endif
    endif
@ -91,14 +91,10 @@ ifeq ($(CONFIG_MALI_BIFROST),m)
            CONFIG_MALI_BIFROST_ENABLE_TRACE ?= y
            CONFIG_MALI_BIFROST_SYSTEM_TRACE ?= y

-            ifeq ($(CONFIG_SYNC), y)
+            ifeq ($(CONFIG_SYNC_FILE), y)
                CONFIG_MALI_BIFROST_FENCE_DEBUG ?= y
            else
-                ifeq ($(CONFIG_SYNC_FILE), y)
-                    CONFIG_MALI_BIFROST_FENCE_DEBUG ?= y
-                else
-                    CONFIG_MALI_BIFROST_FENCE_DEBUG = n
-                endif
+                CONFIG_MALI_BIFROST_FENCE_DEBUG = n
            endif
        else
            # Prevent misuse when CONFIG_MALI_BIFROST_DEBUG=n
@ -160,7 +156,6 @@ CONFIGS := \
    CONFIG_MALI_BIFROST \
    CONFIG_MALI_CSF_SUPPORT \
    CONFIG_MALI_BIFROST_GATOR_SUPPORT \
-    CONFIG_MALI_BIFROST_DMA_FENCE \
    CONFIG_MALI_ARBITER_SUPPORT \
    CONFIG_MALI_ARBITRATION \
    CONFIG_MALI_ARBITER_MODULES \
@ -227,26 +222,47 @@ EXTRA_CFLAGS += -DCONFIG_MALI_PLATFORM_NAME=$(CONFIG_MALI_PLATFORM_NAME)
 # KBUILD_EXTRA_SYMBOLS to prevent warnings about unknown functions
 #

-# The following were added to align with W=1 in scripts/Makefile.extrawarn
-# from the Linux source tree
 KBUILD_CFLAGS += -Wall -Werror
+
+# The following were added to align with W=1 in scripts/Makefile.extrawarn
+# from the Linux source tree (v5.18.14)
 KBUILD_CFLAGS += -Wextra -Wunused -Wno-unused-parameter
 KBUILD_CFLAGS += -Wmissing-declarations
 KBUILD_CFLAGS += -Wmissing-format-attribute
 KBUILD_CFLAGS += -Wmissing-prototypes
 KBUILD_CFLAGS += -Wold-style-definition
-KBUILD_CFLAGS += -Wmissing-include-dirs
+# -Wmissing-include-dirs cannot be enabled because the paths to some of the
+# included directories change depending on whether it is an in-tree or
+# out-of-tree build.
 KBUILD_CFLAGS += $(call cc-option, -Wunused-but-set-variable)
 KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable)
 KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned)
 KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation)
 # The following turn off the warnings enabled by -Wextra
-KBUILD_CFLAGS += -Wno-missing-field-initializers
 KBUILD_CFLAGS += -Wno-sign-compare
-KBUILD_CFLAGS += -Wno-type-limits
+KBUILD_CFLAGS += -Wno-shift-negative-value
+# This flag is needed to avoid build errors on older kernels
+KBUILD_CFLAGS += $(call cc-option, -Wno-cast-function-type)

 KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN1

+# The following were added to align with W=2 in scripts/Makefile.extrawarn
+# from the Linux source tree (v5.18.14)
+KBUILD_CFLAGS += -Wdisabled-optimization
+# The -Wshadow flag cannot be enabled unless upstream kernels are
+# patched to fix redefinitions of certain built-in functions and
+# global variables.
+KBUILD_CFLAGS += $(call cc-option, -Wlogical-op)
+KBUILD_CFLAGS += -Wmissing-field-initializers
+KBUILD_CFLAGS += -Wtype-limits
+KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized)
+KBUILD_CFLAGS += $(call cc-option, -Wunused-macros)
+
+KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2
+
+# This warning is disabled to avoid build failures in some kernel versions
+KBUILD_CFLAGS += -Wno-ignored-qualifiers
+
 all:
 	$(MAKE) -C $(KDIR) M=$(CURDIR) $(MAKE_ARGS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" KBUILD_EXTRA_SYMBOLS="$(EXTRA_SYMBOLS)" modules

--- a/drivers/gpu/arm/bifrost/Mconfig
+++ b/drivers/gpu/arm/bifrost/Mconfig
@ -97,16 +97,6 @@ config MALI_BIFROST_ENABLE_TRACE
 	  Enables tracing in kbase. Trace log available through
 	  the "mali_trace" debugfs file, when the CONFIG_DEBUG_FS is enabled

-config MALI_BIFROST_DMA_FENCE
-	bool "Enable DMA_BUF fence support for Mali"
-	depends on MALI_BIFROST
-	default n
-	help
-	  Support DMA_BUF fences for Mali.
-
-	  This option should only be enabled if the Linux Kernel has built in
-	  support for DMA_BUF fences.
-
 config MALI_ARBITER_SUPPORT
 	bool "Enable arbiter support for Mali"
 	depends on MALI_BIFROST && !MALI_CSF_SUPPORT
@ -129,7 +119,7 @@ config MALI_DMA_BUF_MAP_ON_DEMAND
 	default n
 	default y if !DMA_BUF_SYNC_IOCTL_SUPPORTED
 	help
-	  This option caused kbase to set up the GPU mapping of imported
+	  This option will cause kbase to set up the GPU mapping of imported
 	  dma-buf when needed to run atoms. This is the legacy behavior.

 	  This is intended for testing and the option will get removed in the
@ -157,17 +147,6 @@ menuconfig MALI_BIFROST_EXPERT
 	  Enabling this option and modifying the default settings may produce
 	  a driver with performance or other limitations.

-config MALI_2MB_ALLOC
-	bool "Attempt to allocate 2MB pages"
-	depends on MALI_BIFROST && MALI_BIFROST_EXPERT
-	default n
-	help
-	  Rather than allocating all GPU memory page-by-page, attempt to
-	  allocate 2MB pages from the kernel. This reduces TLB pressure and
-	  helps to prevent memory fragmentation.
-
-	  If in doubt, say N
-
 config MALI_MEMORY_FULLY_BACKED
 	bool "Enable memory fully physically-backed"
 	depends on MALI_BIFROST && MALI_BIFROST_EXPERT
@ -200,10 +179,10 @@ config MALI_FW_CORE_DUMP

 	  Example:
 	  * To explicitly request core dump:
-		echo 1 >/sys/kernel/debug/mali0/fw_core_dump
+	  echo 1 >/sys/kernel/debug/mali0/fw_core_dump
 	  * To output current core dump (after explicitly requesting a core dump,
-	    or kernel driver reported an internal firmware error):
-                cat /sys/kernel/debug/mali0/fw_core_dump
+	  or kernel driver reported an internal firmware error):
+	  cat /sys/kernel/debug/mali0/fw_core_dump

 choice
 	prompt "Error injection level"
@ -343,5 +322,5 @@ config MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE
 	  slowest clock will be selected.


-source "kernel/drivers/gpu/arm/midgard/arbitration/Mconfig"
+source "kernel/drivers/gpu/arm/arbitration/Mconfig"
 source "kernel/drivers/gpu/arm/midgard/tests/Mconfig"
--- a/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c
+++ b/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -28,12 +28,12 @@
 #include <tl/mali_kbase_tracepoints.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
-#include "mali_kbase_arbiter_interface.h"
+#include "linux/mali_arbiter_interface.h"

 /* Arbiter interface version against which was implemented this module */
 #define MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION 5
 #if MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION != \
-			MALI_KBASE_ARBITER_INTERFACE_VERSION
+			MALI_ARBITER_INTERFACE_VERSION
 #error "Unsupported Mali Arbiter interface version."
 #endif

@ -205,6 +205,7 @@ int kbase_arbif_init(struct kbase_device *kbdev)

 	if (!pdev->dev.driver || !try_module_get(pdev->dev.driver->owner)) {
 		dev_err(kbdev->dev, "arbiter_if driver not available\n");
+		put_device(&pdev->dev);
 		return -EPROBE_DEFER;
 	}
 	kbdev->arb.arb_dev = &pdev->dev;
@ -212,6 +213,7 @@ int kbase_arbif_init(struct kbase_device *kbdev)
 	if (!arb_if) {
 		dev_err(kbdev->dev, "arbiter_if driver not ready\n");
 		module_put(pdev->dev.driver->owner);
+		put_device(&pdev->dev);
 		return -EPROBE_DEFER;
 	}

@ -233,6 +235,7 @@ int kbase_arbif_init(struct kbase_device *kbdev)
 		if (err) {
 			dev_err(&pdev->dev, "Failed to register with arbiter\n");
 			module_put(pdev->dev.driver->owner);
+			put_device(&pdev->dev);
 			if (err != -EPROBE_DEFER)
 				err = -EFAULT;
 			return err;
@ -262,8 +265,10 @@ void kbase_arbif_destroy(struct kbase_device *kbdev)
 		arb_if->vm_ops.vm_arb_unregister_dev(kbdev->arb.arb_if);
 	}
 	kbdev->arb.arb_if = NULL;
-	if (kbdev->arb.arb_dev)
+	if (kbdev->arb.arb_dev) {
 		module_put(kbdev->arb.arb_dev->driver->owner);
+		put_device(kbdev->arb.arb_dev);
+	}
 	kbdev->arb.arb_dev = NULL;
 }

--- a/drivers/gpu/arm/bifrost/arbitration/Kconfig
+++ b/drivers/gpu/arm/bifrost/arbitration/Kconfig
@ -1,49 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT
-#
-# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved.
-#
-# This program is free software and is provided to you under the terms of the
-# GNU General Public License version 2 as published by the Free Software
-# Foundation, and any use by you of this program is subject to the terms
-# of such GNU license.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, you can access it online at
-# http://www.gnu.org/licenses/gpl-2.0.html.
-#
-#
-
-config MALI_XEN
-	tristate "Enable Xen Interface reference code"
-	depends on MALI_ARBITRATION && XEN
-	default n
-	help
-	  Enables the build of xen interface modules used in the reference
-	  virtualization setup for Mali
-	  If unsure, say N.
-
-config MALI_ARBITER_MODULES
-	tristate "Enable mali arbiter modules"
-	depends on MALI_ARBITRATION
-	default y
-	help
-	  Enables the build of the arbiter modules used in the reference
-	  virtualization setup for Mali
-	  If unsure, say N
-
-config MALI_GPU_POWER_MODULES
-	tristate "Enable gpu power modules"
-	depends on MALI_ARBITRATION
-	default y
-	help
-	  Enables the build of the gpu power modules used in the reference
-	  virtualization setup for Mali
-	  If unsure, say N
-
-
-source "drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig"
--- a/drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig
+++ b/drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig
@ -1,28 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT
-#
-# (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
-#
-# This program is free software and is provided to you under the terms of the
-# GNU General Public License version 2 as published by the Free Software
-# Foundation, and any use by you of this program is subject to the terms
-# of such GNU license.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, you can access it online at
-# http://www.gnu.org/licenses/gpl-2.0.html.
-#
-#
-
-config MALI_PARTITION_MANAGER
-	tristate "Enable compilation of partition manager modules"
-	depends on MALI_ARBITRATION
-	default n
-	help
-	  This option enables the compilation of the partition manager
-	  modules used to configure the Mali-G78AE GPU.
-
--- a/drivers/gpu/arm/bifrost/backend/gpu/Kbuild
+++ b/drivers/gpu/arm/bifrost/backend/gpu/Kbuild
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2014-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2014-2016, 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014-2016, 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -22,12 +22,32 @@
 #include "backend/gpu/mali_kbase_cache_policy_backend.h"
 #include <device/mali_kbase_device.h>

+/**
+ * kbasep_amba_register_present() - Check whether the AMBA_<> registers are
+ *                                  present in the GPU.
+ * @kbdev:    Device pointer
+ *
+ * Note: Only for arch version 12.x.1 onwards.
+ *
+ * Return: true if AMBA_FEATURES/ENABLE registers are present.
+ */
+static bool kbasep_amba_register_present(struct kbase_device *kbdev)
+{
+	return (ARCH_MAJOR_REV_REG(kbdev->gpu_props.props.raw_props.gpu_id) >=
+		GPU_ID2_ARCH_MAJOR_REV_MAKE(12, 1));
+}

 void kbase_cache_set_coherency_mode(struct kbase_device *kbdev,
 		u32 mode)
 {
 	kbdev->current_gpu_coherency_mode = mode;

+	if (kbasep_amba_register_present(kbdev)) {
+		u32 val = kbase_reg_read(kbdev, AMBA_ENABLE);
+
+		val = AMBA_ENABLE_COHERENCY_PROTOCOL_SET(val, mode);
+		kbase_reg_write(kbdev, AMBA_ENABLE, val);
+	} else
 		kbase_reg_write(kbdev, COHERENCY_ENABLE, mode);
 }

@ -35,9 +55,38 @@ u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev)
 {
 	u32 coherency_features;

+	if (kbasep_amba_register_present(kbdev))
+		coherency_features =
+			kbase_reg_read(kbdev, GPU_CONTROL_REG(AMBA_FEATURES));
+	else
 		coherency_features = kbase_reg_read(
 			kbdev, GPU_CONTROL_REG(COHERENCY_FEATURES));

 	return coherency_features;
 }

+void kbase_amba_set_memory_cache_support(struct kbase_device *kbdev,
+					 bool enable)
+{
+	if (kbasep_amba_register_present(kbdev)) {
+		u32 val = kbase_reg_read(kbdev, AMBA_ENABLE);
+
+		val = AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SET(val, enable);
+		kbase_reg_write(kbdev, AMBA_ENABLE, val);
+
+	} else {
+		WARN(1, "memory_cache_support not supported");
+	}
+}
+
+void kbase_amba_set_invalidate_hint(struct kbase_device *kbdev, bool enable)
+{
+	if (kbasep_amba_register_present(kbdev)) {
+		u32 val = kbase_reg_read(kbdev, AMBA_ENABLE);
+
+		val = AMBA_ENABLE_INVALIDATE_HINT_SET(val, enable);
+		kbase_reg_write(kbdev, AMBA_ENABLE, val);
+	} else {
+		WARN(1, "invalidate_hint not supported");
+	}
+}
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2014-2016, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014-2016, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -43,4 +43,23 @@ void kbase_cache_set_coherency_mode(struct kbase_device *kbdev,
 */
 u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev);

+/**
+ * kbase_amba_set_memory_cache_support() - Sets AMBA memory cache support
+ *                                         in the GPU.
+ * @kbdev:    Device pointer
+ * @enable:   true to enable.
+ *
+ * Note: Only for arch version 12.x.1 onwards; triggers a WARN otherwise.
+ */
+void kbase_amba_set_memory_cache_support(struct kbase_device *kbdev,
+					 bool enable);
+/**
+ * kbase_amba_set_invalidate_hint() - Sets AMBA invalidate hint
+ *                                    in the GPU.
+ * @kbdev:    Device pointer
+ * @enable:   true to enable.
+ *
+ * Note: Only for arch version 12.x.1 onwards; triggers a WARN otherwise.
+ */
+void kbase_amba_set_invalidate_hint(struct kbase_device *kbdev, bool enable);
 #endif /* _KBASE_CACHE_POLICY_BACKEND_H_ */
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2014, 2016, 2018-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014, 2016, 2018-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -26,7 +26,7 @@
 #ifndef _KBASE_INSTR_DEFS_H_
 #define _KBASE_INSTR_DEFS_H_

-#include <mali_kbase_hwcnt_gpu.h>
+#include <hwcnt/mali_kbase_hwcnt_gpu.h>

 /*
 * Instrumentation State Machine States
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2014-2016, 2018-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014-2016, 2018-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -163,7 +163,6 @@ static irq_handler_t kbase_handler_table[] = {

 #ifdef CONFIG_MALI_BIFROST_DEBUG
 #define  JOB_IRQ_HANDLER JOB_IRQ_TAG
-#define  MMU_IRQ_HANDLER MMU_IRQ_TAG
 #define  GPU_IRQ_HANDLER GPU_IRQ_TAG

 /**
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c
@ -34,7 +34,7 @@
 #include <mali_kbase_ctx_sched.h>
 #include <mali_kbase_kinstr_jm.h>
 #include <mali_kbase_hwaccess_instr.h>
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <device/mali_kbase_device.h>
 #include <backend/gpu/mali_kbase_irq_internal.h>
 #include <backend/gpu/mali_kbase_jm_internal.h>
@ -1440,6 +1440,11 @@ bool kbase_reset_gpu_is_active(struct kbase_device *kbdev)
 	return true;
 }

+bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev)
+{
+	return atomic_read(&kbdev->hwaccess.backend.reset_gpu) == KBASE_RESET_GPU_NOT_PENDING;
+}
+
 int kbase_reset_gpu_wait(struct kbase_device *kbdev)
 {
 	wait_event(kbdev->hwaccess.backend.reset_wait,
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c
@ -29,7 +29,7 @@
 #include <mali_kbase_jm.h>
 #include <mali_kbase_js.h>
 #include <tl/mali_kbase_tracepoints.h>
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <mali_kbase_reset_gpu.h>
 #include <mali_kbase_kinstr_jm.h>
 #include <backend/gpu/mali_kbase_cache_policy_backend.h>
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c
@ -80,31 +80,360 @@ static bool ipa_control_timer_enabled;
 #endif

 #define LO_MASK(M) ((M) & 0xFFFFFFFF)
+#if !MALI_USE_CSF
 #define HI_MASK(M) ((M) & 0xFFFFFFFF00000000)
+#endif

-static u32 get_implementation_register(u32 reg)
-{
-	switch (reg) {
-	case GPU_CONTROL_REG(SHADER_PRESENT_LO):
-		return LO_MASK(DUMMY_IMPLEMENTATION_SHADER_PRESENT);
-	case GPU_CONTROL_REG(TILER_PRESENT_LO):
-		return LO_MASK(DUMMY_IMPLEMENTATION_TILER_PRESENT);
-	case GPU_CONTROL_REG(L2_PRESENT_LO):
-		return LO_MASK(DUMMY_IMPLEMENTATION_L2_PRESENT);
-	case GPU_CONTROL_REG(STACK_PRESENT_LO):
-		return LO_MASK(DUMMY_IMPLEMENTATION_STACK_PRESENT);
+/* Construct a value for the THREAD_FEATURES register, *except* the two most
+ * significant bits, which are set to IMPLEMENTATION_MODEL in
+ * midgard_model_read_reg().
+ *
+ * MAX_REGISTERS occupies the least significant bits in both variants.
+ * With MALI_USE_CSF, MAX_TASK_QUEUE is shifted to bit 24 and MAX_TG_SPLIT is
+ * accepted but ignored, so callers can use one argument list for both builds.
+ * Without MALI_USE_CSF, MAX_TASK_QUEUE is shifted to bit 16 and MAX_TG_SPLIT
+ * to bit 24.
+ */
+#if MALI_USE_CSF
+#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT)                       \
+	((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24))
+#else
+#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT)                       \
+	((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24))
+#endif

-	case GPU_CONTROL_REG(SHADER_PRESENT_HI):
-	case GPU_CONTROL_REG(TILER_PRESENT_HI):
-	case GPU_CONTROL_REG(L2_PRESENT_HI):
-	case GPU_CONTROL_REG(STACK_PRESENT_HI):
-	/* *** FALLTHROUGH *** */
-	default:
-		return 0;
-	}
-}
+struct error_status_t hw_error_status;

-struct {
+/**
+ * struct control_reg_values_t - control register values specific to the GPU being 'emulated'
+ * @name:			GPU name
+ * @gpu_id:			GPU ID to report
+ * @as_present:			Bitmap of address spaces present
+ * @thread_max_threads:		Maximum number of threads per core
+ * @thread_max_workgroup_size:	Maximum number of threads per workgroup
+ * @thread_max_barrier_size:	Maximum number of threads per barrier
+ * @thread_features:		Thread features, NOT INCLUDING the 2
+ *				most-significant bits, which are always set to
+ *				IMPLEMENTATION_MODEL.
+ * @core_features:		Core features
+ * @tiler_features:		Tiler features
+ * @mmu_features:		MMU features
+ * @gpu_features_lo:		GPU features (low)
+ * @gpu_features_hi:		GPU features (high)
+ * @shader_present:		Available shader bitmap, returned by
+ *				get_implementation_register() for
+ *				SHADER_PRESENT_LO
+ * @stack_present:		Core stack present bitmap, returned by
+ *				get_implementation_register() for
+ *				STACK_PRESENT_LO
+ */
+struct control_reg_values_t {
+	const char *name;
+	u32 gpu_id;
+	u32 as_present;
+	u32 thread_max_threads;
+	u32 thread_max_workgroup_size;
+	u32 thread_max_barrier_size;
+	u32 thread_features;
+	u32 core_features;
+	u32 tiler_features;
+	u32 mmu_features;
+	u32 gpu_features_lo;
+	u32 gpu_features_hi;
+	u32 shader_present;
+	u32 stack_present;
+};
+
+/* Per-slot job state kept by the dummy model; struct dummy_model_t holds
+ * NUM_SLOTS of these in its slots[] array.
+ *
+ * NOTE(review): the per-field notes below are inferred from the field names
+ * only - confirm against the code that reads and writes them.
+ */
+struct job_slot {
+	/* presumably non-zero while a job is running on the slot */
+	int job_active;
+	/* presumably non-zero while a further job is waiting on the slot */
+	int job_queued;
+	/* presumably non-zero while the slot's completion IRQ is raised */
+	int job_complete_irq_asserted;
+	/* presumably the slot's IRQ mask state - TODO confirm polarity */
+	int job_irq_mask;
+	/* presumably non-zero when the slot does not accept jobs */
+	int job_disabled;
+};
+
+/* State of one emulated GPU instance. The fields are updated by
+ * midgard_model_write_reg() and reported back by midgard_model_read_reg().
+ */
+struct dummy_model_t {
+	int reset_completed;
+	/* Bit 8 of GPU_IRQ_MASK */
+	int reset_completed_mask;
+#if !MALI_USE_CSF
+	/* Set by GPU_COMMAND_PRFCNT_SAMPLE, cleared through GPU_IRQ_CLEAR */
+	int prfcnt_sample_completed;
+#endif /* !MALI_USE_CSF */
+	/* Bits 9-10 of GPU_IRQ_MASK */
+	int power_changed_mask;	/* 2bits: _ALL,_SINGLE */
+	int power_changed;	/* 1bit */
+	/* Set when a cache clean is requested, cleared by bit 17 of GPU_IRQ_CLEAR */
+	bool clean_caches_completed;
+	/* Bit 17 of GPU_IRQ_MASK */
+	bool clean_caches_completed_irq_enabled;
+#if MALI_USE_CSF
+	/* Set by the FLUSH_PA_RANGE commands, cleared by bit 20 of GPU_IRQ_CLEAR */
+	bool flush_pa_range_completed;
+	/* Bit 20 of GPU_IRQ_MASK */
+	bool flush_pa_range_completed_irq_enabled;
+#endif
+	/* Power state bitmap: L2 is read from bit 0, the tiler from bit 1 and
+	 * the shader cores from bit 2 upwards. Shader power writes are masked
+	 * with control_reg_values->shader_present, so the number of shader
+	 * bits depends on the emulated GPU.
+	 */
+	int power_on;
+	/* Masked with the stack present bitmap when STACK_READY_LO is read */
+	u32 stack_power_on_lo;
+	/* Last value written to COHERENCY_ENABLE */
+	u32 coherency_enable;
+	unsigned int job_irq_js_state;
+	struct job_slot slots[NUM_SLOTS];
+	/* Entry of all_control_reg_values[] chosen by find_control_reg_values() */
+	const struct control_reg_values_t *control_reg_values;
+	/* Last value written to L2_CONFIG */
+	u32 l2_config;
+	/* Opaque pointer; NOTE(review): presumably set via gpu_device_set_data() */
+	void *data;
+};
+
+/* Array associating GPU names with control register values. The first
+ * one is used in the case of no match.
+ *
+ * Entries that do not name .core_features leave it zero-initialised.
+ * tGOx has one entry per revision (tGOx_r0p0 / tGOx_r1p0);
+ * find_control_reg_values() maps the plain name "tGOx" onto one of them.
+ * tTUx and tTIx set their own shader_present (0xFF) and stack_present (0xF);
+ * every other entry uses the DUMMY_IMPLEMENTATION_* defaults.
+ */
+static const struct control_reg_values_t all_control_reg_values[] = {
+	{
+		.name = "tMIx",
+		.gpu_id = GPU_ID2_MAKE(6, 0, 10, 0, 0, 1, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tHEx",
+		.gpu_id = GPU_ID2_MAKE(6, 2, 0, 1, 0, 3, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tSIx",
+		.gpu_id = GPU_ID2_MAKE(7, 0, 0, 0, 1, 1, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x300,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x209,
+		.mmu_features = 0x2821,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tDVx",
+		.gpu_id = GPU_ID2_MAKE(7, 0, 0, 3, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x300,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x209,
+		.mmu_features = 0x2821,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tNOx",
+		.gpu_id = GPU_ID2_MAKE(7, 2, 1, 1, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tGOx_r0p0",
+		.gpu_id = GPU_ID2_MAKE(7, 2, 2, 2, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tGOx_r1p0",
+		.gpu_id = GPU_ID2_MAKE(7, 4, 0, 2, 1, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+		.core_features = 0x2,
+		.tiler_features = 0x209,
+		.mmu_features = 0x2823,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tTRx",
+		.gpu_id = GPU_ID2_MAKE(9, 0, 8, 0, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tNAx",
+		.gpu_id = GPU_ID2_MAKE(9, 0, 8, 1, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tBEx",
+		.gpu_id = GPU_ID2_MAKE(9, 2, 0, 2, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tBAx",
+		.gpu_id = GPU_ID2_MAKE(9, 14, 4, 5, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tDUx",
+		.gpu_id = GPU_ID2_MAKE(10, 2, 0, 1, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tODx",
+		.gpu_id = GPU_ID2_MAKE(10, 8, 0, 2, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tGRx",
+		.gpu_id = GPU_ID2_MAKE(10, 10, 0, 3, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.core_features = 0x0, /* core_1e16fma2tex */
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tVAx",
+		.gpu_id = GPU_ID2_MAKE(10, 12, 0, 4, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x180,
+		.thread_max_workgroup_size = 0x180,
+		.thread_max_barrier_size = 0x180,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+		.core_features = 0x0, /* core_1e16fma2tex */
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0,
+		.gpu_features_hi = 0,
+		.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+		.stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT,
+	},
+	{
+		.name = "tTUx",
+		.gpu_id = GPU_ID2_MAKE(11, 8, 5, 2, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x800,
+		.thread_max_workgroup_size = 0x400,
+		.thread_max_barrier_size = 0x400,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x10000, 4, 0),
+		.core_features = 0x0, /* core_1e32fma2tex */
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0xf,
+		.gpu_features_hi = 0,
+		.shader_present = 0xFF,
+		.stack_present = 0xF,
+	},
+	{
+		.name = "tTIx",
+		.gpu_id = GPU_ID2_MAKE(12, 8, 1, 0, 0, 0, 0),
+		.as_present = 0xFF,
+		.thread_max_threads = 0x800,
+		.thread_max_workgroup_size = 0x400,
+		.thread_max_barrier_size = 0x400,
+		.thread_features = THREAD_FEATURES_PARTIAL(0x10000, 16, 0),
+		.core_features = 0x1, /* core_1e64fma4tex */
+		.tiler_features = 0x809,
+		.mmu_features = 0x2830,
+		.gpu_features_lo = 0xf,
+		.gpu_features_hi = 0,
+		.shader_present = 0xFF,
+		.stack_present = 0xF,
+	},
+};
+
+static struct {
 	spinlock_t access_lock;
 #if !MALI_USE_CSF
 	unsigned long prfcnt_base;
@ -125,74 +454,33 @@ struct {
 #endif /* !MALI_USE_CSF */
 	u64 tiler_counters[KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
 	u64 l2_counters[KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS *
-			KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+					KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
 	u64 shader_counters[KBASE_DUMMY_MODEL_MAX_SHADER_CORES *
-			    KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+						KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+} performance_counters;

-} performance_counters = {
-	.l2_present = DUMMY_IMPLEMENTATION_L2_PRESENT,
-	.shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
-};
+/**
+ * get_implementation_register - Value of one of the *_PRESENT_* registers
+ * @reg:                Register address, as built by GPU_CONTROL_REG()
+ * @control_reg_values: Control register values of the emulated GPU; supplies
+ *                      the shader and core stack present bitmaps
+ *
+ * The tiler and L2 bitmaps do not depend on the emulated GPU; they come from
+ * the DUMMY_IMPLEMENTATION_* constants.
+ *
+ * Return: low 32 bits of the matching present bitmap for the *_PRESENT_LO
+ *         registers; 0 for the *_PRESENT_HI registers and any other register.
+ */
+static u32 get_implementation_register(u32 reg,
+				       const struct control_reg_values_t *const control_reg_values)
+{
+	switch (reg) {
+	case GPU_CONTROL_REG(SHADER_PRESENT_LO):
+		return LO_MASK(control_reg_values->shader_present);
+	case GPU_CONTROL_REG(TILER_PRESENT_LO):
+		return LO_MASK(DUMMY_IMPLEMENTATION_TILER_PRESENT);
+	case GPU_CONTROL_REG(L2_PRESENT_LO):
+		return LO_MASK(DUMMY_IMPLEMENTATION_L2_PRESENT);
+	case GPU_CONTROL_REG(STACK_PRESENT_LO):
+		return LO_MASK(control_reg_values->stack_present);

-struct job_slot {
-	int job_active;
-	int job_queued;
-	int job_complete_irq_asserted;
-	int job_irq_mask;
-	int job_disabled;
-};
-
-/**
- * struct control_reg_values_t - control register values specific to the GPU being 'emulated'
- * @name:			GPU name
- * @gpu_id:			GPU ID to report
- * @as_present:			Bitmap of address spaces present
- * @thread_max_threads:		Maximum number of threads per core
- * @thread_max_workgroup_size:	Maximum number of threads per workgroup
- * @thread_max_barrier_size:	Maximum number of threads per barrier
- * @thread_features:		Thread features, NOT INCLUDING the 2
- *				most-significant bits, which are always set to
- *				IMPLEMENTATION_MODEL.
- * @core_features:		Core features
- * @tiler_features:		Tiler features
- * @mmu_features:		MMU features
- * @gpu_features_lo:		GPU features (low)
- * @gpu_features_hi:		GPU features (high)
- */
-struct control_reg_values_t {
-	const char *name;
-	u32 gpu_id;
-	u32 as_present;
-	u32 thread_max_threads;
-	u32 thread_max_workgroup_size;
-	u32 thread_max_barrier_size;
-	u32 thread_features;
-	u32 core_features;
-	u32 tiler_features;
-	u32 mmu_features;
-	u32 gpu_features_lo;
-	u32 gpu_features_hi;
-};
-
-struct dummy_model_t {
-	int reset_completed;
-	int reset_completed_mask;
-#if !MALI_USE_CSF
-	int prfcnt_sample_completed;
-#endif /* !MALI_USE_CSF */
-	int power_changed_mask;	/* 2bits: _ALL,_SINGLE */
-	int power_changed;	/* 1bit */
-	bool clean_caches_completed;
-	bool clean_caches_completed_irq_enabled;
-	int power_on;		/* 6bits: SHADER[4],TILER,L2 */
-	u32 stack_power_on_lo;
-	u32 coherency_enable;
-	unsigned int job_irq_js_state;
-	struct job_slot slots[NUM_SLOTS];
-	const struct control_reg_values_t *control_reg_values;
-	u32 l2_config;
-	void *data;
-};
+	case GPU_CONTROL_REG(SHADER_PRESENT_HI):
+	case GPU_CONTROL_REG(TILER_PRESENT_HI):
+	case GPU_CONTROL_REG(L2_PRESENT_HI):
+	case GPU_CONTROL_REG(STACK_PRESENT_HI):
+	/* *** FALLTHROUGH *** */
+	default:
+		return 0;
+	}
+}

 void gpu_device_set_data(void *model, void *data)
 {
@ -221,238 +509,6 @@ static char *no_mali_gpu = CONFIG_MALI_NO_MALI_DEFAULT_GPU;
 module_param(no_mali_gpu, charp, 0000);
 MODULE_PARM_DESC(no_mali_gpu, "GPU to identify as");

-/* Construct a value for the THREAD_FEATURES register, *except* the two most
- * significant bits, which are set to IMPLEMENTATION_MODEL in
- * midgard_model_read_reg().
- */
-#if MALI_USE_CSF
-#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
-	((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24))
-#else
-#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
-	((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24))
-#endif
-
-/* Array associating GPU names with control register values. The first
- * one is used in the case of no match.
- */
-static const struct control_reg_values_t all_control_reg_values[] = {
-	{
-		.name = "tMIx",
-		.gpu_id = GPU_ID2_MAKE(6, 0, 10, 0, 0, 1, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tHEx",
-		.gpu_id = GPU_ID2_MAKE(6, 2, 0, 1, 0, 3, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tSIx",
-		.gpu_id = GPU_ID2_MAKE(7, 0, 0, 0, 1, 1, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x300,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x209,
-		.mmu_features = 0x2821,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tDVx",
-		.gpu_id = GPU_ID2_MAKE(7, 0, 0, 3, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x300,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x209,
-		.mmu_features = 0x2821,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tNOx",
-		.gpu_id = GPU_ID2_MAKE(7, 2, 1, 1, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tGOx_r0p0",
-		.gpu_id = GPU_ID2_MAKE(7, 2, 2, 2, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tGOx_r1p0",
-		.gpu_id = GPU_ID2_MAKE(7, 4, 0, 2, 1, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
-		.core_features = 0x2,
-		.tiler_features = 0x209,
-		.mmu_features = 0x2823,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tTRx",
-		.gpu_id = GPU_ID2_MAKE(9, 0, 8, 0, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tNAx",
-		.gpu_id = GPU_ID2_MAKE(9, 0, 8, 1, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tBEx",
-		.gpu_id = GPU_ID2_MAKE(9, 2, 0, 2, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tBAx",
-		.gpu_id = GPU_ID2_MAKE(9, 14, 4, 5, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tDUx",
-		.gpu_id = GPU_ID2_MAKE(10, 2, 0, 1, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tODx",
-		.gpu_id = GPU_ID2_MAKE(10, 8, 0, 2, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tGRx",
-		.gpu_id = GPU_ID2_MAKE(10, 10, 0, 3, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.core_features = 0x0, /* core_1e16fma2tex */
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tVAx",
-		.gpu_id = GPU_ID2_MAKE(10, 12, 0, 4, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x180,
-		.thread_max_workgroup_size = 0x180,
-		.thread_max_barrier_size = 0x180,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
-		.core_features = 0x0, /* core_1e16fma2tex */
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0,
-		.gpu_features_hi = 0,
-	},
-	{
-		.name = "tTUx",
-		.gpu_id = GPU_ID2_MAKE(11, 8, 5, 2, 0, 0, 0),
-		.as_present = 0xFF,
-		.thread_max_threads = 0x800,
-		.thread_max_workgroup_size = 0x400,
-		.thread_max_barrier_size = 0x400,
-		.thread_features = THREAD_FEATURES_PARTIAL(0x10000, 4, 0),
-		.core_features = 0x0, /* core_1e32fma2tex */
-		.tiler_features = 0x809,
-		.mmu_features = 0x2830,
-		.gpu_features_lo = 0xf,
-		.gpu_features_hi = 0,
-	},
-};
-
-struct error_status_t hw_error_status;
-
 #if MALI_USE_CSF
 static u32 gpu_model_get_prfcnt_value(enum kbase_ipa_core_type core_type,
 				      u32 cnt_idx, bool is_low_word)
@ -1011,6 +1067,21 @@ static const struct control_reg_values_t *find_control_reg_values(const char *gp
 	size_t i;
 	const struct control_reg_values_t *ret = NULL;

+	/* Edge case for tGOx, as it has 2 entries in the table for its R0 and R1
+	 * revisions respectively. As none of them are named "tGOx" the name comparison
+	 * needs to be fixed in these cases. CONFIG_GPU_HWVER should be one of "r0p0"
+	 * or "r1p0" and is derived from the DDK's build configuration. In cases
+	 * where it is unavailable, it defaults to tGOx r1p0.
+	 */
+	if (!strcmp(gpu, "tGOx")) {
+#ifdef CONFIG_GPU_HWVER
+		if (!strcmp(CONFIG_GPU_HWVER, "r0p0"))
+			gpu = "tGOx_r0p0";
+		else if (!strcmp(CONFIG_GPU_HWVER, "r1p0"))
+#endif /* CONFIG_GPU_HWVER defined */
+			gpu = "tGOx_r1p0";
+	}
+
 	for (i = 0; i < ARRAY_SIZE(all_control_reg_values); ++i) {
 		const struct control_reg_values_t * const fcrv = &all_control_reg_values[i];

@ -1043,6 +1114,10 @@ void *midgard_model_create(const void *config)
 		dummy->job_irq_js_state = 0;
 		init_register_statuses(dummy);
 		dummy->control_reg_values = find_control_reg_values(no_mali_gpu);
+		performance_counters.l2_present = get_implementation_register(
+			GPU_CONTROL_REG(L2_PRESENT_LO), dummy->control_reg_values);
+		performance_counters.shader_present = get_implementation_register(
+			GPU_CONTROL_REG(SHADER_PRESENT_LO), dummy->control_reg_values);
 	}
 	return dummy;
 }
@ -1066,6 +1141,8 @@ static void midgard_model_get_outputs(void *h)
 	    hw_error_status.gpu_error_irq ||
 #if !MALI_USE_CSF
 	    dummy->prfcnt_sample_completed ||
+#else
+	    (dummy->flush_pa_range_completed && dummy->flush_pa_range_completed_irq_enabled) ||
 #endif
 	    (dummy->clean_caches_completed && dummy->clean_caches_completed_irq_enabled))
 		gpu_device_raise_irq(dummy, GPU_DUMMY_GPU_IRQ);
@ -1235,6 +1312,9 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 		dummy->reset_completed_mask = (value >> 8) & 0x01;
 		dummy->power_changed_mask = (value >> 9) & 0x03;
 		dummy->clean_caches_completed_irq_enabled = (value & (1u << 17)) != 0u;
+#if MALI_USE_CSF
+		dummy->flush_pa_range_completed_irq_enabled = (value & (1u << 20)) != 0u;
+#endif
 	} else if (addr == GPU_CONTROL_REG(COHERENCY_ENABLE)) {
 		dummy->coherency_enable = value;
 	} else if (addr == GPU_CONTROL_REG(GPU_IRQ_CLEAR)) {
@ -1247,10 +1327,17 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)

 		if (value & (1 << 17))
 			dummy->clean_caches_completed = false;
-#if   !MALI_USE_CSF
-		if (value & PRFCNT_SAMPLE_COMPLETED)
+
+#if MALI_USE_CSF
+		if (value & (1u << 20))
+			dummy->flush_pa_range_completed = false;
+#endif /* MALI_USE_CSF */
+
+#if !MALI_USE_CSF
+		if (value & PRFCNT_SAMPLE_COMPLETED) /* (1 << 16) */
 			dummy->prfcnt_sample_completed = 0;
 #endif /* !MALI_USE_CSF */
+
 		/*update error status */
 		hw_error_status.gpu_error_irq &= ~(value);
 	} else if (addr == GPU_CONTROL_REG(GPU_COMMAND)) {
@ -1274,7 +1361,15 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 			pr_debug("clean caches requested");
 			dummy->clean_caches_completed = true;
 			break;
-#if   !MALI_USE_CSF
+#if MALI_USE_CSF
+		case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2:
+		case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2_LSC:
+		case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_FULL:
+			pr_debug("pa range flush requested");
+			dummy->flush_pa_range_completed = true;
+			break;
+#endif /* MALI_USE_CSF */
+#if !MALI_USE_CSF
 		case GPU_COMMAND_PRFCNT_SAMPLE:
 			midgard_model_dump_prfcnt();
 			dummy->prfcnt_sample_completed = 1;
@ -1282,6 +1377,11 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 		default:
 			break;
 		}
+#if MALI_USE_CSF
+	} else if (addr >= GPU_CONTROL_REG(GPU_COMMAND_ARG0_LO) &&
+		   addr <= GPU_CONTROL_REG(GPU_COMMAND_ARG1_HI)) {
+		/* Writes ignored */
+#endif
 	} else if (addr == GPU_CONTROL_REG(L2_CONFIG)) {
 		dummy->l2_config = value;
 	}
@ -1291,6 +1391,12 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 						(CSF_NUM_DOORBELL * CSF_HW_DOORBELL_PAGE_SIZE))) {
 		if (addr == GPU_CONTROL_REG(CSF_HW_DOORBELL_PAGE_OFFSET))
 			hw_error_status.job_irq_status = JOB_IRQ_GLOBAL_IF;
+	} else if ((addr >= GPU_CONTROL_REG(SYSC_ALLOC0)) &&
+		   (addr < GPU_CONTROL_REG(SYSC_ALLOC(SYSC_ALLOC_COUNT)))) {
+		/* Do nothing */
+	} else if ((addr >= GPU_CONTROL_REG(ASN_HASH_0)) &&
+		   (addr < GPU_CONTROL_REG(ASN_HASH(ASN_HASH_COUNT)))) {
+		/* Do nothing */
 	} else if (addr == IPA_CONTROL_REG(COMMAND)) {
 		pr_debug("Received IPA_CONTROL command");
 	} else if (addr == IPA_CONTROL_REG(TIMER)) {
@ -1315,8 +1421,7 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 		hw_error_status.mmu_irq_mask = value;
 	} else if (addr == MMU_REG(MMU_IRQ_CLEAR)) {
 		hw_error_status.mmu_irq_rawstat &= (~value);
-	} else if ((addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)) &&
-			(addr <= MMU_AS_REG(15, AS_STATUS))) {
+	} else if ((addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)) && (addr <= MMU_AS_REG(15, AS_STATUS))) {
 		int mem_addr_space = (addr - MMU_AS_REG(0, AS_TRANSTAB_LO))
 									>> 6;

@ -1443,7 +1548,8 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 			dummy->power_changed = 1;
 			break;
 		case SHADER_PWRON_LO:
-			dummy->power_on |= (value & 0xF) << 2;
+			dummy->power_on |=
+				(value & dummy->control_reg_values->shader_present) << 2;
 			dummy->power_changed = 1;
 			break;
 		case L2_PWRON_LO:
@ -1459,7 +1565,8 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
 			dummy->power_changed = 1;
 			break;
 		case SHADER_PWROFF_LO:
-			dummy->power_on &= ~((value & 0xF) << 2);
+			dummy->power_on &=
+				~((value & dummy->control_reg_values->shader_present) << 2);
 			dummy->power_changed = 1;
 			break;
 		case L2_PWROFF_LO:
@ -1546,6 +1653,9 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
 	else if (addr == GPU_CONTROL_REG(GPU_IRQ_MASK)) {
 		*value = (dummy->reset_completed_mask << 8) |
 			 ((dummy->clean_caches_completed_irq_enabled ? 1u : 0u) << 17) |
+#if MALI_USE_CSF
+			 ((dummy->flush_pa_range_completed_irq_enabled ? 1u : 0u) << 20) |
+#endif
 			 (dummy->power_changed_mask << 9) | (1 << 7) | 1;
 		pr_debug("GPU_IRQ_MASK read %x", *value);
 	} else if (addr == GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)) {
@ -1555,6 +1665,9 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
 			 (dummy->prfcnt_sample_completed ? PRFCNT_SAMPLE_COMPLETED : 0) |
 #endif /* !MALI_USE_CSF */
 			 ((dummy->clean_caches_completed ? 1u : 0u) << 17) |
+#if MALI_USE_CSF
+			 ((dummy->flush_pa_range_completed ? 1u : 0u) << 20) |
+#endif
 			 hw_error_status.gpu_error_irq;
 		pr_debug("GPU_IRQ_RAWSTAT read %x", *value);
 	} else if (addr == GPU_CONTROL_REG(GPU_IRQ_STATUS)) {
@ -1569,6 +1682,13 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
 				   1u :
 				   0u)
 			  << 17) |
+#if MALI_USE_CSF
+			 (((dummy->flush_pa_range_completed &&
+			    dummy->flush_pa_range_completed_irq_enabled) ?
+				   1u :
+				   0u)
+			  << 20) |
+#endif
 			 hw_error_status.gpu_error_irq;
 		pr_debug("GPU_IRQ_STAT read %x", *value);
 	} else if (addr == GPU_CONTROL_REG(GPU_STATUS)) {
@ -1581,8 +1701,18 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
 		*value = hw_error_status.gpu_fault_status;
 	} else if (addr == GPU_CONTROL_REG(L2_CONFIG)) {
 		*value = dummy->l2_config;
-	} else if ((addr >= GPU_CONTROL_REG(SHADER_PRESENT_LO)) &&
-				(addr <= GPU_CONTROL_REG(L2_MMU_CONFIG))) {
+	}
+#if MALI_USE_CSF
+	else if ((addr >= GPU_CONTROL_REG(SYSC_ALLOC0)) &&
+		 (addr < GPU_CONTROL_REG(SYSC_ALLOC(SYSC_ALLOC_COUNT)))) {
+		*value = 0;
+	} else if ((addr >= GPU_CONTROL_REG(ASN_HASH_0)) &&
+		   (addr < GPU_CONTROL_REG(ASN_HASH(ASN_HASH_COUNT)))) {
+		*value = 0;
+	}
+#endif
+	else if ((addr >= GPU_CONTROL_REG(SHADER_PRESENT_LO)) &&
+		 (addr <= GPU_CONTROL_REG(L2_MMU_CONFIG))) {
 		switch (addr) {
 		case GPU_CONTROL_REG(SHADER_PRESENT_LO):
 		case GPU_CONTROL_REG(SHADER_PRESENT_HI):
@ -1592,27 +1722,27 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
 		case GPU_CONTROL_REG(L2_PRESENT_HI):
 		case GPU_CONTROL_REG(STACK_PRESENT_LO):
 		case GPU_CONTROL_REG(STACK_PRESENT_HI):
-			*value = get_implementation_register(addr);
+			*value = get_implementation_register(addr, dummy->control_reg_values);
 			break;
 		case GPU_CONTROL_REG(SHADER_READY_LO):
 			*value = (dummy->power_on >> 0x02) &
-			get_implementation_register(
-				GPU_CONTROL_REG(SHADER_PRESENT_LO));
+				 get_implementation_register(GPU_CONTROL_REG(SHADER_PRESENT_LO),
+							     dummy->control_reg_values);
 			break;
 		case GPU_CONTROL_REG(TILER_READY_LO):
 			*value = (dummy->power_on >> 0x01) &
-				 get_implementation_register(
-				GPU_CONTROL_REG(TILER_PRESENT_LO));
+				 get_implementation_register(GPU_CONTROL_REG(TILER_PRESENT_LO),
+							     dummy->control_reg_values);
 			break;
 		case GPU_CONTROL_REG(L2_READY_LO):
 			*value = dummy->power_on &
-				 get_implementation_register(
-				GPU_CONTROL_REG(L2_PRESENT_LO));
+				 get_implementation_register(GPU_CONTROL_REG(L2_PRESENT_LO),
+							     dummy->control_reg_values);
 			break;
 		case GPU_CONTROL_REG(STACK_READY_LO):
 			*value = dummy->stack_power_on_lo &
-				 get_implementation_register(
-				GPU_CONTROL_REG(STACK_PRESENT_LO));
+				 get_implementation_register(GPU_CONTROL_REG(STACK_PRESENT_LO),
+							     dummy->control_reg_values);
 			break;

 		case GPU_CONTROL_REG(SHADER_READY_HI):
@ -1904,6 +2034,8 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)

 		*value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_SHADER,
 						    counter_index, is_low_word);
+	} else if (addr == USER_REG(LATEST_FLUSH)) {
+		*value = 0;
 	}
 #endif
 	else if (addr == GPU_CONTROL_REG(GPU_FEATURES_LO)) {
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c
@ -23,13 +23,6 @@
 #include <linux/random.h>
 #include "backend/gpu/mali_kbase_model_dummy.h"

-/* all the error conditions supported by the model */
-#define TOTAL_FAULTS 27
-/* maximum number of levels in the MMU translation table tree */
-#define MAX_MMU_TABLE_LEVEL 4
-/* worst case scenario is <1 MMU fault + 1 job fault + 2 GPU faults> */
-#define MAX_CONCURRENT_FAULTS 3
-
 static struct kbase_error_atom *error_track_list;

 unsigned int rand_seed;
@ -40,6 +33,14 @@ unsigned int error_probability = 50;	/* to be set between 0 and 100 */
 unsigned int multiple_error_probability = 50;

 #ifdef CONFIG_MALI_ERROR_INJECT_RANDOM
+
+/* all the error conditions supported by the model */
+#define TOTAL_FAULTS 27
+/* maximum number of levels in the MMU translation table tree */
+#define MAX_MMU_TABLE_LEVEL 4
+/* worst case scenario is <1 MMU fault + 1 job fault + 2 GPU faults> */
+#define MAX_CONCURRENT_FAULTS 3
+
 /**
 * gpu_generate_error - Generate GPU error
 */
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c
@ -36,7 +36,7 @@
 #include <linux/pm_runtime.h>
 #include <mali_kbase_reset_gpu.h>
 #endif /* !MALI_USE_CSF */
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <backend/gpu/mali_kbase_pm_internal.h>
 #include <backend/gpu/mali_kbase_devfreq.h>
 #include <mali_kbase_dummy_job_wa.h>
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2013-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2013-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -92,29 +92,10 @@ void kbase_devfreq_set_core_mask(struct kbase_device *kbdev, u64 core_mask)
 	 * for those cores to get powered down
 	 */
 	if ((core_mask & old_core_mask) != old_core_mask) {
-		bool can_wait;
-
-		spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-		can_wait = kbdev->pm.backend.gpu_ready && kbase_pm_is_mcu_desired(kbdev);
-		spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
-
-		/* This check is ideally not required, the wait function can
-		 * deal with the GPU power down. But it has been added to
-		 * address the scenario where down-scaling request comes from
-		 * the platform specific code soon after the GPU power down
-		 * and at the time same time application thread tries to
-		 * power up the GPU (on the flush of GPU queue).
-		 * The platform specific @ref callback_power_on that gets
-		 * invoked on power up does not return until down-scaling
-		 * request is complete. The check mitigates the race caused by
-		 * the problem in platform specific code.
-		 */
-		if (likely(can_wait)) {
-			if (kbase_pm_wait_for_desired_state(kbdev)) {
-				dev_warn(kbdev->dev,
-					 "Wait for update of core_mask from %llx to %llx failed",
-					 old_core_mask, core_mask);
-			}
+		if (kbase_pm_wait_for_cores_down_scale(kbdev)) {
+			dev_warn(kbdev->dev,
+				 "Wait for update of core_mask from %llx to %llx failed",
+				 old_core_mask, core_mask);
 		}
 	}
 #endif
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c
@ -39,7 +39,7 @@

 #include <mali_kbase_reset_gpu.h>
 #include <mali_kbase_ctx_sched.h>
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <mali_kbase_pbha.h>
 #include <backend/gpu/mali_kbase_cache_policy_backend.h>
 #include <device/mali_kbase_device.h>
@ -538,6 +538,14 @@ static void kbase_pm_l2_config_override(struct kbase_device *kbdev)
 	if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_L2_CONFIG))
 		return;

+#if MALI_USE_CSF
+	if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PBHA_HWU)) {
+		val = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG));
+		kbase_reg_write(kbdev, GPU_CONTROL_REG(L2_CONFIG),
+				L2_CONFIG_PBHA_HWU_SET(val, kbdev->pbha_propagate_bits));
+	}
+#endif /* MALI_USE_CSF */
+
 	/*
 	 * Skip if size and hash are not given explicitly,
 	 * which means default values are used.
@ -599,6 +607,21 @@ static const char *kbase_mcu_state_to_string(enum kbase_mcu_state state)
 		return strings[state];
 }

+static
+void kbase_ktrace_log_mcu_state(struct kbase_device *kbdev, enum kbase_mcu_state state)
+{
+#if KBASE_KTRACE_ENABLE
+	switch (state) {
+#define KBASEP_MCU_STATE(n) \
+	case KBASE_MCU_ ## n: \
+		KBASE_KTRACE_ADD(kbdev, PM_MCU_ ## n, NULL, state); \
+		break;
+#include "mali_kbase_pm_mcu_states.h"
+#undef KBASEP_MCU_STATE
+	}
+#endif
+}
+
 static inline bool kbase_pm_handle_mcu_core_attr_update(struct kbase_device *kbdev)
 {
 	struct kbase_pm_backend_data *backend = &kbdev->pm.backend;
@ -689,7 +712,6 @@ static void wait_mcu_as_inactive(struct kbase_device *kbdev)
 }
 #endif

-
 /**
 * kbasep_pm_toggle_power_interrupt - Toggles the IRQ mask for power interrupts
 *                                    from the firmware
@ -697,10 +719,10 @@ static void wait_mcu_as_inactive(struct kbase_device *kbdev)
 * @kbdev:  Pointer to the device
 * @enable: boolean indicating to enable interrupts or not
 *
- * The POWER_CHANGED_ALL and POWER_CHANGED_SINGLE interrupts can be disabled
- * after L2 has been turned on when FW is controlling the power for the shader
- * cores. Correspondingly, the interrupts can be re-enabled after the MCU has
- * been disabled before the power down of L2.
+ * The POWER_CHANGED_ALL interrupt can be disabled after L2 has been turned on
+ * when FW is controlling the power for the shader cores. Correspondingly, the
+ * interrupt can be re-enabled after the MCU has been disabled, before the
+ * power down of L2.
 */
 static void kbasep_pm_toggle_power_interrupt(struct kbase_device *kbdev, bool enable)
 {
@ -710,10 +732,12 @@ static void kbasep_pm_toggle_power_interrupt(struct kbase_device *kbdev, bool en

 	irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK));

-	if (enable)
-		irq_mask |= POWER_CHANGED_ALL | POWER_CHANGED_SINGLE;
-	else
-		irq_mask &= ~(POWER_CHANGED_ALL | POWER_CHANGED_SINGLE);
+	if (enable) {
+		irq_mask |= POWER_CHANGED_ALL;
+		kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), POWER_CHANGED_ALL);
+	} else {
+		irq_mask &= ~POWER_CHANGED_ALL;
+	}

 	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask);
 }
@ -1028,10 +1052,12 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
 			     backend->mcu_state);
 		}

-		if (backend->mcu_state != prev_state)
+		if (backend->mcu_state != prev_state) {
 			dev_dbg(kbdev->dev, "MCU state transition: %s to %s\n",
 				kbase_mcu_state_to_string(prev_state),
 				kbase_mcu_state_to_string(backend->mcu_state));
+			kbase_ktrace_log_mcu_state(kbdev, backend->mcu_state);
+		}

 	} while (backend->mcu_state != prev_state);

@ -1079,6 +1105,21 @@ static const char *kbase_l2_core_state_to_string(enum kbase_l2_core_state state)
 		return strings[state];
 }

+static
+void kbase_ktrace_log_l2_core_state(struct kbase_device *kbdev, enum kbase_l2_core_state state)
+{
+#if KBASE_KTRACE_ENABLE
+	switch (state) {
+#define KBASEP_L2_STATE(n) \
+	case KBASE_L2_ ## n: \
+		KBASE_KTRACE_ADD(kbdev, PM_L2_ ## n, NULL, state); \
+		break;
+#include "mali_kbase_pm_l2_states.h"
+#undef KBASEP_L2_STATE
+	}
+#endif
+}
+
 #if !MALI_USE_CSF
 /* On powering on the L2, the tracked kctx becomes stale and can be cleared.
 * This enables the backend to spare the START_FLUSH.INV_SHADER_OTHER
@ -1136,18 +1177,13 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 				KBASE_PM_CORE_L2);
 		u64 l2_ready = kbase_pm_get_ready_cores(kbdev,
 				KBASE_PM_CORE_L2);
-#ifdef CONFIG_MALI_ARBITER_SUPPORT
-		u64 tiler_trans = kbase_pm_get_trans_cores(
-				kbdev, KBASE_PM_CORE_TILER);
-		u64 tiler_ready = kbase_pm_get_ready_cores(
-				kbdev, KBASE_PM_CORE_TILER);

+#ifdef CONFIG_MALI_ARBITER_SUPPORT
 		/*
 		 * kbase_pm_get_ready_cores and kbase_pm_get_trans_cores
 		 * are vulnerable to corruption if gpu is lost
 		 */
-		if (kbase_is_gpu_removed(kbdev)
-				|| kbase_pm_is_gpu_lost(kbdev)) {
+		if (kbase_is_gpu_removed(kbdev) || kbase_pm_is_gpu_lost(kbdev)) {
 			backend->shaders_state =
 				KBASE_SHADERS_OFF_CORESTACK_OFF;
 			backend->hwcnt_desired = false;
@ -1161,16 +1197,19 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 				 */
 				backend->l2_state =
 					KBASE_L2_ON_HWCNT_DISABLE;
+				KBASE_KTRACE_ADD(kbdev, PM_L2_ON_HWCNT_DISABLE, NULL,
+							backend->l2_state);
 				kbase_pm_trigger_hwcnt_disable(kbdev);
 			}

 			if (backend->hwcnt_disabled) {
 				backend->l2_state = KBASE_L2_OFF;
+				KBASE_KTRACE_ADD(kbdev, PM_L2_OFF, NULL, backend->l2_state);
 				dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n");
 			}
 			break;
 		}
-#endif /* CONFIG_MALI_ARBITER_SUPPORT */
+#endif

 		/* mask off ready from trans in case transitions finished
 		 * between the register reads
@ -1182,6 +1221,12 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 		switch (backend->l2_state) {
 		case KBASE_L2_OFF:
 			if (kbase_pm_is_l2_desired(kbdev)) {
+#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME)
+				/* Enable HW timer of IPA control before
+				 * L2 cache is powered-up.
+				 */
+				kbase_ipa_control_handle_gpu_sleep_exit(kbdev);
+#endif
 				/*
 				 * Set the desired config for L2 before
 				 * powering it on
@ -1221,14 +1266,12 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 			l2_power_up_done = false;
 			if (!l2_trans && l2_ready == l2_present) {
 				if (need_tiler_control(kbdev)) {
-#ifndef CONFIG_MALI_ARBITER_SUPPORT
 					u64 tiler_trans = kbase_pm_get_trans_cores(
 						kbdev, KBASE_PM_CORE_TILER);
 					u64 tiler_ready = kbase_pm_get_ready_cores(
 						kbdev, KBASE_PM_CORE_TILER);
-#endif
-
 					tiler_trans &= ~tiler_ready;
+
 					if (!tiler_trans && tiler_ready == tiler_present) {
 						KBASE_KTRACE_ADD(kbdev,
 								 PM_CORES_CHANGE_AVAILABLE_TILER,
@ -1437,12 +1480,26 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 				/* We only need to check the L2 here - if the L2
 				 * is off then the tiler is definitely also off.
 				 */
-				if (!l2_trans && !l2_ready)
+				if (!l2_trans && !l2_ready) {
+#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME)
+					/* Allow clock gating within the GPU and prevent it
+					 * from being seen as active during sleep.
+					 */
+					kbase_ipa_control_handle_gpu_sleep_enter(kbdev);
+#endif
 					/* L2 is now powered off */
 					backend->l2_state = KBASE_L2_OFF;
+				}
 			} else {
-				if (!kbdev->cache_clean_in_progress)
+				if (!kbdev->cache_clean_in_progress) {
+#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME)
+					/* Allow clock gating within the GPU and prevent it
+					 * from being seen as active during sleep.
+					 */
+					kbase_ipa_control_handle_gpu_sleep_enter(kbdev);
+#endif
 					backend->l2_state = KBASE_L2_OFF;
+				}
 			}
 			break;

@ -1457,11 +1514,13 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
 					backend->l2_state);
 		}

-		if (backend->l2_state != prev_state)
+		if (backend->l2_state != prev_state) {
 			dev_dbg(kbdev->dev, "L2 state transition: %s to %s\n",
 				kbase_l2_core_state_to_string(prev_state),
 				kbase_l2_core_state_to_string(
 					backend->l2_state));
+			kbase_ktrace_log_l2_core_state(kbdev, backend->l2_state);
+		}

 	} while (backend->l2_state != prev_state);

@ -1925,7 +1984,7 @@ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev)
 			kbdev->pm.backend.shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF)
 		in_desired_state = false;
 #else
-	in_desired_state = kbase_pm_mcu_is_in_desired_state(kbdev);
+	in_desired_state &= kbase_pm_mcu_is_in_desired_state(kbdev);
 #endif

 	return in_desired_state;
@ -2122,6 +2181,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev)

 	backend->in_reset = true;
 	backend->l2_state = KBASE_L2_RESET_WAIT;
+	KBASE_KTRACE_ADD(kbdev, PM_L2_RESET_WAIT, NULL, backend->l2_state);
 #if !MALI_USE_CSF
 	backend->shaders_state = KBASE_SHADERS_RESET_WAIT;
 #else
@ -2130,6 +2190,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev)
 	 */
 	if (likely(kbdev->csf.firmware_inited)) {
 		backend->mcu_state = KBASE_MCU_RESET_WAIT;
+		KBASE_KTRACE_ADD(kbdev, PM_MCU_RESET_WAIT, NULL, backend->mcu_state);
 #ifdef KBASE_PM_RUNTIME
 		backend->exit_gpu_sleep_mode = true;
 #endif
@ -2328,6 +2389,66 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev)
 }
 KBASE_EXPORT_TEST_API(kbase_pm_wait_for_desired_state);

+#if MALI_USE_CSF
+/**
+ * core_mask_update_done - Check if downscaling of shader cores is done
+ *
+ * @kbdev: The kbase device structure for the device.
+ *
+ * This function checks if the downscaling of cores is effectively complete.
+ *
+ * Return: true if the downscale is done.
+ */
+static bool core_mask_update_done(struct kbase_device *kbdev)
+{
+	bool update_done = false;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+	/* If MCU is in stable ON state then it implies that the downscale
+	 * request had completed.
+	 * If MCU is not active then it implies all cores are off, so can
+	 * consider the downscale request as complete.
+	 */
+	if ((kbdev->pm.backend.mcu_state == KBASE_MCU_ON) ||
+	    kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state))
+		update_done = true;
+	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+	return update_done;
+}
+
+int kbase_pm_wait_for_cores_down_scale(struct kbase_device *kbdev)
+{
+	long timeout = kbase_csf_timeout_in_jiffies(kbase_get_timeout_ms(kbdev, CSF_PM_TIMEOUT));
+	long remaining;
+	int err = 0;
+
+	/* Wait for core mask update to complete */
+#if KERNEL_VERSION(4, 13, 1) <= LINUX_VERSION_CODE
+	remaining = wait_event_killable_timeout(
+		kbdev->pm.backend.gpu_in_desired_state_wait,
+		core_mask_update_done(kbdev), timeout);
+#else
+	remaining = wait_event_timeout(
+		kbdev->pm.backend.gpu_in_desired_state_wait,
+		core_mask_update_done(kbdev), timeout);
+#endif
+
+	if (!remaining) {
+		kbase_pm_timed_out(kbdev);
+		err = -ETIMEDOUT;
+	} else if (remaining < 0) {
+		dev_info(
+			kbdev->dev,
+			"Wait for cores down scaling got interrupted");
+		err = (int)remaining;
+	}
+
+	return err;
+}
+#endif
+
 void kbase_pm_enable_interrupts(struct kbase_device *kbdev)
 {
 	unsigned long flags;
@ -2391,19 +2512,25 @@ static void update_user_reg_page_mapping(struct kbase_device *kbdev)
 	lockdep_assert_held(&kbdev->pm.lock);

 	mutex_lock(&kbdev->csf.reg_lock);
-	if (kbdev->csf.mali_file_inode) {
-		/* This would zap the pte corresponding to the mapping of User
-		 * register page for all the Kbase contexts.
-		 */
-		unmap_mapping_range(kbdev->csf.mali_file_inode->i_mapping,
-				    BASEP_MEM_CSF_USER_REG_PAGE_HANDLE,
-				    PAGE_SIZE, 1);
+
+	/* Only if the mappings for USER page exist, update all PTEs associated to it */
+	if (kbdev->csf.nr_user_page_mapped > 0) {
+		if (likely(kbdev->csf.mali_file_inode)) {
+			/* This would zap the pte corresponding to the mapping of User
+			 * register page for all the Kbase contexts.
+			 */
+			unmap_mapping_range(kbdev->csf.mali_file_inode->i_mapping,
+					    BASEP_MEM_CSF_USER_REG_PAGE_HANDLE, PAGE_SIZE, 1);
+		} else {
+			dev_err(kbdev->dev,
+				"Device file inode not exist even if USER page previously mapped");
+		}
 	}
+
 	mutex_unlock(&kbdev->csf.reg_lock);
 }
 #endif

-
 /*
 * pmu layout:
 * 0x0000: PMU TAG (RO) (0xCAFECAFE)
@ -2541,7 +2668,6 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume)
 		backend->gpu_idled = false;
 	}
 #endif
-
 }

 KBASE_EXPORT_TEST_API(kbase_pm_clock_on);
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h
@ -269,6 +269,37 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
 */
 int kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev);

+#if MALI_USE_CSF
+/**
+ * kbase_pm_wait_for_cores_down_scale - Wait for the downscaling of shader cores
+ *
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * This function can be called to ensure that the downscaling of cores is
+ * effectively complete and it would be safe to lower the voltage.
+ * The function assumes that caller had exercised the MCU state machine for the
+ * downscale request through the kbase_pm_update_state() function.
+ *
+ * This function needs to be used by the caller to safely wait for the completion
+ * of downscale request, instead of kbase_pm_wait_for_desired_state().
+ * The downscale request would trigger a state change in MCU state machine
+ * and so when MCU reaches the stable ON state, it can be inferred that
+ * downscaling is complete. But it has been observed that the wake up of the
+ * waiting thread can get delayed by a few milliseconds and by the time the
+ * thread wakes up the power down transition could have started (after the
+ * completion of downscale request).
+ * On the completion of power down transition another wake up signal would be
+ * sent, but again by the time thread wakes up the power up transition can begin.
+ * And the power up transition could then get blocked inside the platform specific
+ * callback_power_on() function due to the thread that called into Kbase (from the
+ * platform specific code) to perform the downscaling and then ended up waiting
+ * for the completion of downscale request.
+ *
+ * Return: 0 on success, -ETIMEDOUT on timeout, or a negative error code if interrupted.
+ */
+int kbase_pm_wait_for_cores_down_scale(struct kbase_device *kbdev);
+#endif
+
 /**
 * kbase_pm_update_dynamic_cores_onoff - Update the L2 and shader power state
 *                                       machines after changing shader core
--- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c
+++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c
@ -38,11 +38,13 @@
 #include <backend/gpu/mali_kbase_pm_defs.h>
 #include <mali_linux_trace.h>

+#if defined(CONFIG_MALI_BIFROST_DEVFREQ) || defined(CONFIG_MALI_BIFROST_DVFS) || !MALI_USE_CSF
 /* Shift used for kbasep_pm_metrics_data.time_busy/idle - units of (1 << 8) ns
 * This gives a maximum period between samples of 2^(32+8)/100 ns = slightly
 * under 11s. Exceeding this will cause overflow
 */
 #define KBASE_PM_TIME_SHIFT			8
+#endif

 #if MALI_USE_CSF
 /* To get the GPU_ACTIVE value in nano seconds unit */
--- a/drivers/gpu/arm/bifrost/build.bp
+++ b/drivers/gpu/arm/bifrost/build.bp
@ -32,6 +32,7 @@ bob_defaults {
        kbuild_options: [
            "CONFIG_MALI_BIFROST_NO_MALI=y",
            "CONFIG_MALI_NO_MALI_DEFAULT_GPU={{.gpu}}",
+            "CONFIG_GPU_HWVER={{.hwver}}",
        ],
    },
    mali_platform_dt_pin_rst: {
@ -52,9 +53,6 @@ bob_defaults {
    mali_midgard_enable_trace: {
        kbuild_options: ["CONFIG_MALI_BIFROST_ENABLE_TRACE=y"],
    },
-    mali_dma_fence: {
-        kbuild_options: ["CONFIG_MALI_BIFROST_DMA_FENCE=y"],
-    },
    mali_arbiter_support: {
        kbuild_options: ["CONFIG_MALI_ARBITER_SUPPORT=y"],
    },
@ -64,7 +62,7 @@ bob_defaults {
    mali_dma_buf_legacy_compat: {
        kbuild_options: ["CONFIG_MALI_DMA_BUF_LEGACY_COMPAT=y"],
    },
-    mali_2mb_alloc: {
+    large_page_alloc: {
        kbuild_options: ["CONFIG_MALI_2MB_ALLOC=y"],
    },
    mali_memory_fully_backed: {
@ -89,7 +87,7 @@ bob_defaults {
        kbuild_options: ["CONFIG_MALI_BIFROST_ERROR_INJECT=y"],
    },
    mali_gem5_build: {
-       kbuild_options: ["CONFIG_MALI_GEM5_BUILD=y"],
+        kbuild_options: ["CONFIG_MALI_GEM5_BUILD=y"],
    },
    mali_debug: {
        kbuild_options: [
@ -163,9 +161,7 @@ bob_defaults {
        // (catch-all for experimental CS code without separating it into
        // different features).
        "MALI_INCREMENTAL_RENDERING_JM={{.incremental_rendering_jm}}",
-        "MALI_GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}",
        "MALI_BASE_CSF_PERFORMANCE_TESTS={{.base_csf_performance_tests}}",
-        "MALI_GPU_TIMESTAMP_INTERPOLATION={{.gpu_timestamp_interpolation}}",
    ],
 }

@ -184,6 +180,10 @@ bob_kernel_module {
        "context/*.c",
        "context/*.h",
        "context/Kbuild",
+        "hwcnt/*.c",
+        "hwcnt/*.h",
+        "hwcnt/backend/*.h",
+        "hwcnt/Kbuild",
        "ipa/*.c",
        "ipa/*.h",
        "ipa/Kbuild",
@ -217,6 +217,10 @@ bob_kernel_module {
            "device/backend/*_jm.c",
            "gpu/backend/*_jm.c",
            "gpu/backend/*_jm.h",
+            "hwcnt/backend/*_jm.c",
+            "hwcnt/backend/*_jm.h",
+            "hwcnt/backend/*_jm_*.c",
+            "hwcnt/backend/*_jm_*.h",
            "jm/*.h",
            "tl/backend/*_jm.c",
            "mmu/backend/*_jm.c",
@ -238,6 +242,10 @@ bob_kernel_module {
            "device/backend/*_csf.c",
            "gpu/backend/*_csf.c",
            "gpu/backend/*_csf.h",
+            "hwcnt/backend/*_csf.c",
+            "hwcnt/backend/*_csf.h",
+            "hwcnt/backend/*_csf_*.c",
+            "hwcnt/backend/*_csf_*.h",
            "tl/backend/*_csf.c",
            "mmu/backend/*_csf.c",
            "ipa/backend/*_csf.c",
--- a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c
+++ b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c
@ -26,7 +26,6 @@
 #include <context/mali_kbase_context_internal.h>
 #include <gpu/mali_kbase_gpu_regmap.h>
 #include <mali_kbase.h>
-#include <mali_kbase_dma_fence.h>
 #include <mali_kbase_mem_linux.h>
 #include <mali_kbase_mem_pool_group.h>
 #include <mmu/mali_kbase_mmu.h>
@ -39,12 +38,14 @@
 #include <csf/mali_kbase_csf_cpu_queue_debugfs.h>
 #include <mali_kbase_debug_mem_view.h>
 #include <mali_kbase_debug_mem_zones.h>
+#include <mali_kbase_debug_mem_allocs.h>
 #include <mali_kbase_mem_pool_debugfs.h>

 void kbase_context_debugfs_init(struct kbase_context *const kctx)
 {
 	kbase_debug_mem_view_init(kctx);
 	kbase_debug_mem_zones_init(kctx);
+	kbase_debug_mem_allocs_init(kctx);
 	kbase_mem_pool_debugfs_init(kctx->kctx_dentry, kctx);
 	kbase_jit_debugfs_init(kctx);
 	kbase_csf_queue_group_debugfs_init(kctx);
--- a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c
+++ b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c
@ -27,7 +27,6 @@
 #include <gpu/mali_kbase_gpu_regmap.h>
 #include <mali_kbase.h>
 #include <mali_kbase_ctx_sched.h>
-#include <mali_kbase_dma_fence.h>
 #include <mali_kbase_kinstr_jm.h>
 #include <mali_kbase_mem_linux.h>
 #include <mali_kbase_mem_pool_group.h>
@ -37,12 +36,14 @@
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 #include <mali_kbase_debug_mem_view.h>
 #include <mali_kbase_debug_mem_zones.h>
+#include <mali_kbase_debug_mem_allocs.h>
 #include <mali_kbase_mem_pool_debugfs.h>

 void kbase_context_debugfs_init(struct kbase_context *const kctx)
 {
 	kbase_debug_mem_view_init(kctx);
 	kbase_debug_mem_zones_init(kctx);
+	kbase_debug_mem_allocs_init(kctx);
 	kbase_mem_pool_debugfs_init(kctx->kctx_dentry, kctx);
 	kbase_jit_debugfs_init(kctx);
 	kbasep_jd_debugfs_ctx_init(kctx);
@ -128,8 +129,6 @@ static const struct kbase_context_init context_init[] = {
 	{ NULL, kbase_context_free, NULL },
 	{ kbase_context_common_init, kbase_context_common_term,
 	  "Common context initialization failed" },
-	{ kbase_dma_fence_init, kbase_dma_fence_term,
-	  "DMA fence initialization failed" },
 	{ kbase_context_mem_pool_group_init, kbase_context_mem_pool_group_term,
 	  "Memory pool group initialization failed" },
 	{ kbase_mem_evictable_init, kbase_mem_evictable_deinit,
--- a/drivers/gpu/arm/bifrost/context/mali_kbase_context.c
+++ b/drivers/gpu/arm/bifrost/context/mali_kbase_context.c
@ -165,7 +165,9 @@ int kbase_context_common_init(struct kbase_context *kctx)
 	atomic64_set(&kctx->num_fixed_allocs, 0);
 #endif

+	kbase_gpu_vm_lock(kctx);
 	bitmap_copy(kctx->cookies, &cookies_mask, BITS_PER_LONG);
+	kbase_gpu_vm_unlock(kctx);

 	kctx->id = atomic_add_return(1, &(kctx->kbdev->ctx_num)) - 1;

@ -274,10 +276,8 @@ void kbase_context_common_term(struct kbase_context *kctx)

 int kbase_context_mem_pool_group_init(struct kbase_context *kctx)
 {
-	return kbase_mem_pool_group_init(&kctx->mem_pools,
-		kctx->kbdev,
-		&kctx->kbdev->mem_pool_defaults,
-		&kctx->kbdev->mem_pools);
+	return kbase_mem_pool_group_init(&kctx->mem_pools, kctx->kbdev,
+					 &kctx->kbdev->mem_pool_defaults, &kctx->kbdev->mem_pools);
 }

 void kbase_context_mem_pool_group_term(struct kbase_context *kctx)
--- a/drivers/gpu/arm/bifrost/csf/Kbuild
+++ b/drivers/gpu/arm/bifrost/csf/Kbuild
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2018-2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@ -34,12 +34,16 @@ bifrost_kbase-y += \
    csf/mali_kbase_csf_protected_memory.o \
    csf/mali_kbase_csf_tiler_heap_debugfs.o \
    csf/mali_kbase_csf_cpu_queue_debugfs.o \
-    csf/mali_kbase_csf_event.o
+    csf/mali_kbase_csf_event.o \
+    csf/mali_kbase_csf_firmware_log.o \
+    csf/mali_kbase_csf_tiler_heap_reclaim.o

 bifrost_kbase-$(CONFIG_MALI_REAL_HW) += csf/mali_kbase_csf_firmware.o

 bifrost_kbase-$(CONFIG_MALI_BIFROST_NO_MALI) += csf/mali_kbase_csf_firmware_no_mali.o

+bifrost_kbase-$(CONFIG_DEBUG_FS) += csf/mali_kbase_debug_csf_fault.o
+

 ifeq ($(KBUILD_EXTMOD),)
 # in-tree
--- a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c
+++ b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c
@ -28,8 +28,6 @@
 * Status flags from the STATUS register of the IPA Control interface.
 */
 #define STATUS_COMMAND_ACTIVE ((u32)1 << 0)
-#define STATUS_TIMER_ACTIVE ((u32)1 << 1)
-#define STATUS_AUTO_ACTIVE ((u32)1 << 2)
 #define STATUS_PROTECTED_MODE ((u32)1 << 8)
 #define STATUS_RESET ((u32)1 << 9)
 #define STATUS_TIMER_ENABLED ((u32)1 << 31)
@ -37,9 +35,7 @@
 /*
 * Commands for the COMMAND register of the IPA Control interface.
 */
-#define COMMAND_NOP ((u32)0)
 #define COMMAND_APPLY ((u32)1)
-#define COMMAND_CLEAR ((u32)2)
 #define COMMAND_SAMPLE ((u32)3)
 #define COMMAND_PROTECTED_ACK ((u32)4)
 #define COMMAND_RESET_ACK ((u32)5)
@ -965,6 +961,43 @@ void kbase_ipa_control_handle_gpu_reset_post(struct kbase_device *kbdev)
 }
 KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_reset_post);

+#ifdef KBASE_PM_RUNTIME
+void kbase_ipa_control_handle_gpu_sleep_enter(struct kbase_device *kbdev)
+{
+	lockdep_assert_held(&kbdev->hwaccess_lock);
+
+	if (kbdev->pm.backend.mcu_state == KBASE_MCU_IN_SLEEP) {
+		/* GPU Sleep is treated as a power down */
+		kbase_ipa_control_handle_gpu_power_off(kbdev);
+
+		/* SELECT_CSHW register needs to be cleared to prevent any
+		 * IPA control message from being sent to the top level GPU HWCNT.
+		 */
+		kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_LO), 0);
+		kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_HI), 0);
+
+		/* No need to issue the APPLY command here */
+	}
+}
+KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_sleep_enter);
+
+void kbase_ipa_control_handle_gpu_sleep_exit(struct kbase_device *kbdev)
+{
+	lockdep_assert_held(&kbdev->hwaccess_lock);
+
+	if (kbdev->pm.backend.mcu_state == KBASE_MCU_IN_SLEEP) {
+		/* To keep things simple, currently exit from
+		 * GPU Sleep is treated as a power on event where
+		 * all 4 SELECT registers are reconfigured.
+		 * On exit from sleep, reconfiguration is needed
+		 * only for the SELECT_CSHW register.
+		 */
+		kbase_ipa_control_handle_gpu_power_on(kbdev);
+	}
+}
+KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_sleep_exit);
+#endif
+
 #if MALI_UNIT_TEST
 void kbase_ipa_control_rate_change_notify_test(struct kbase_device *kbdev,
 					       u32 clk_index, u32 clk_rate_hz)
--- a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h
+++ b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -198,6 +198,33 @@ void kbase_ipa_control_handle_gpu_reset_pre(struct kbase_device *kbdev);
 */
 void kbase_ipa_control_handle_gpu_reset_post(struct kbase_device *kbdev);

+#ifdef KBASE_PM_RUNTIME
+/**
+ * kbase_ipa_control_handle_gpu_sleep_enter - Handle the pre GPU Sleep event
+ *
+ * @kbdev:          Pointer to kbase device.
+ *
+ * This function is called after MCU has been put to sleep state & L2 cache has
+ * been powered down. The top level part of GPU is still powered up when this
+ * function is called.
+ */
+void kbase_ipa_control_handle_gpu_sleep_enter(struct kbase_device *kbdev);
+
+/**
+ * kbase_ipa_control_handle_gpu_sleep_exit - Handle the post GPU Sleep event
+ *
+ * @kbdev:          Pointer to kbase device.
+ *
+ * This function is called when L2 needs to be powered up and MCU can exit the
+ * sleep state. The top level part of GPU is powered up when this function is
+ * called.
+ *
+ * This function must be called only if kbase_ipa_control_handle_gpu_sleep_enter()
+ * was called previously.
+ */
+void kbase_ipa_control_handle_gpu_sleep_exit(struct kbase_device *kbdev);
+#endif
+
 #if MALI_UNIT_TEST
 /**
 * kbase_ipa_control_rate_change_notify_test - Notify GPU rate change
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c
@ -348,9 +348,8 @@ int kbase_csf_alloc_command_stream_user_pages(struct kbase_context *kctx,
 	if (!reg)
 		return -ENOMEM;

-	ret = kbase_mem_pool_alloc_pages(
-				&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_IO],
-				num_pages, queue->phys, false);
+	ret = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_IO], num_pages,
+					 queue->phys, false);

 	if (ret != num_pages)
 		goto phys_alloc_failed;
@ -374,8 +373,11 @@ int kbase_csf_alloc_command_stream_user_pages(struct kbase_context *kctx,

 	queue->db_file_offset = kbdev->csf.db_file_offsets;
 	kbdev->csf.db_file_offsets += BASEP_QUEUE_NR_MMAP_USER_PAGES;
-
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 	WARN(atomic_read(&queue->refcount) != 1, "Incorrect refcounting for queue object\n");
+#else
+	WARN(refcount_read(&queue->refcount) != 1, "Incorrect refcounting for queue object\n");
+#endif
 	/* This is the second reference taken on the queue object and
 	 * would be dropped only when the IO mapping is removed either
 	 * explicitly by userspace or implicitly by kernel on process exit.
@ -444,25 +446,34 @@ static struct kbase_queue *find_queue(struct kbase_context *kctx, u64 base_addr)

 static void get_queue(struct kbase_queue *queue)
 {
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 	WARN_ON(!atomic_inc_not_zero(&queue->refcount));
+#else
+	WARN_ON(!refcount_inc_not_zero(&queue->refcount));
+#endif
 }

 static void release_queue(struct kbase_queue *queue)
 {
 	lockdep_assert_held(&queue->kctx->csf.lock);
-
-	WARN_ON(atomic_read(&queue->refcount) <= 0);
-
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 	if (atomic_dec_and_test(&queue->refcount)) {
+#else
+	if (refcount_dec_and_test(&queue->refcount)) {
+#endif
 		/* The queue can't still be on the per context list. */
 		WARN_ON(!list_empty(&queue->link));
 		WARN_ON(queue->group);
+		dev_dbg(queue->kctx->kbdev->dev,
+			"Remove any pending command queue fatal from ctx %d_%d",
+			queue->kctx->tgid, queue->kctx->id);
+		kbase_csf_event_remove_error(queue->kctx, &queue->error);
 		kfree(queue);
 	}
 }

 static void oom_event_worker(struct work_struct *data);
-static void fatal_event_worker(struct work_struct *data);
+static void cs_error_worker(struct work_struct *data);

 /* Between reg and reg_ex, one and only one must be null */
 static int csf_queue_register_internal(struct kbase_context *kctx,
@ -565,7 +576,11 @@ static int csf_queue_register_internal(struct kbase_context *kctx,
 	queue->enabled = false;

 	queue->priority = reg->priority;
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 	atomic_set(&queue->refcount, 1);
+#else
+	refcount_set(&queue->refcount, 1);
+#endif

 	queue->group = NULL;
 	queue->bind_state = KBASE_CSF_QUEUE_UNBOUND;
@ -588,7 +603,7 @@ static int csf_queue_register_internal(struct kbase_context *kctx,
 	INIT_LIST_HEAD(&queue->link);
 	INIT_LIST_HEAD(&queue->error.link);
 	INIT_WORK(&queue->oom_event_work, oom_event_worker);
-	INIT_WORK(&queue->fatal_event_work, fatal_event_worker);
+	INIT_WORK(&queue->cs_error_work, cs_error_worker);
 	list_add(&queue->link, &kctx->csf.queue_list);

 	queue->extract_ofs = 0;
@ -699,11 +714,6 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx,
 		}
 		kbase_gpu_vm_unlock(kctx);

-		dev_dbg(kctx->kbdev->dev,
-			"Remove any pending command queue fatal from context %pK\n",
-			(void *)kctx);
-		kbase_csf_event_remove_error(kctx, &queue->error);
-
 		release_queue(queue);
 	}

@ -784,6 +794,11 @@ static struct kbase_queue_group *get_bound_queue_group(
 	return group;
 }

+static void enqueue_gpu_submission_work(struct kbase_context *const kctx)
+{
+	queue_work(system_highpri_wq, &kctx->csf.pending_submission_work);
+}
+
 /**
 * pending_submission_worker() - Work item to process pending kicked GPU command queues.
 *
@ -813,11 +828,21 @@ static void pending_submission_worker(struct work_struct *work)
 	list_for_each_entry(queue, &kctx->csf.queue_list, link) {
 		if (atomic_cmpxchg(&queue->pending, 1, 0) == 1) {
 			struct kbase_queue_group *group = get_bound_queue_group(queue);
+			int ret;

-			if (!group || queue->bind_state != KBASE_CSF_QUEUE_BOUND)
+			if (!group || queue->bind_state != KBASE_CSF_QUEUE_BOUND) {
 				dev_dbg(kbdev->dev, "queue is not bound to a group");
-			else
-				WARN_ON(kbase_csf_scheduler_queue_start(queue));
+				continue;
+			}
+
+			ret = kbase_csf_scheduler_queue_start(queue);
+			if (unlikely(ret)) {
+				dev_dbg(kbdev->dev, "Failed to start queue");
+				if (ret == -EBUSY) {
+					atomic_cmpxchg(&queue->pending, 0, 1);
+					enqueue_gpu_submission_work(kctx);
+				}
+			}
 		}
 	}

@ -831,6 +856,8 @@ void kbase_csf_ring_csg_doorbell(struct kbase_device *kbdev, int slot)
 	if (WARN_ON(slot < 0))
 		return;

+	kbase_csf_scheduler_spin_lock_assert_held(kbdev);
+
 	kbase_csf_ring_csg_slots_doorbell(kbdev, (u32) (1 << slot));
 }

@ -843,6 +870,8 @@ void kbase_csf_ring_csg_slots_doorbell(struct kbase_device *kbdev,
 		(u32) ((1U << kbdev->csf.global_iface.group_num) - 1);
 	u32 value;

+	kbase_csf_scheduler_spin_lock_assert_held(kbdev);
+
 	if (WARN_ON(slot_bitmap > allowed_bitmap))
 		return;

@ -872,6 +901,8 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev,
 	struct kbase_csf_cmd_stream_group_info *ginfo;
 	u32 value;

+	kbase_csf_scheduler_spin_lock_assert_held(kbdev);
+
 	if (WARN_ON(csg_nr < 0) ||
 	    WARN_ON(csg_nr >= kbdev->csf.global_iface.group_num))
 		return;
@ -891,11 +922,6 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev,
 		kbase_csf_ring_csg_doorbell(kbdev, csg_nr);
 }

-static void enqueue_gpu_submission_work(struct kbase_context *const kctx)
-{
-	queue_work(system_highpri_wq, &kctx->csf.pending_submission_work);
-}
-
 int kbase_csf_queue_kick(struct kbase_context *kctx,
 			 struct kbase_ioctl_cs_queue_kick *kick)
 {
@ -1129,9 +1155,8 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx,
 	}

 	/* Get physical page for a normal suspend buffer */
-	err = kbase_mem_pool_alloc_pages(
-			&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
-			nr_pages, &s_buf->phy[0], false);
+	err = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], nr_pages,
+					 &s_buf->phy[0], false);

 	if (err < 0)
 		goto phy_pages_alloc_failed;
@ -1362,6 +1387,11 @@ static int create_queue_group(struct kbase_context *const kctx,
 			group->cs_unrecoverable = false;
 			group->reevaluate_idle_status = false;

+			group->dvs_buf = create->in.dvs_buf;
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+			group->deschedule_deferred_cnt = 0;
+#endif

 			group->group_uid = generate_group_uid();
 			create->out.group_uid = group->group_uid;
@ -1377,6 +1407,9 @@ static int create_queue_group(struct kbase_context *const kctx,
 					MAX_SUPPORTED_STREAMS_PER_GROUP);

 			group->run_state = KBASE_CSF_GROUP_INACTIVE;
+			KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_INACTIVE, group,
+						group->run_state);
+
 			err = create_suspend_buffers(kctx, group);

 			if (err < 0) {
@ -1396,6 +1429,17 @@ static int create_queue_group(struct kbase_context *const kctx,
 	return group_handle;
 }

+static bool dvs_supported(u32 csf_version)
+{
+	const u32 major = GLB_VERSION_MAJOR_GET(csf_version);
+	const u32 minor = GLB_VERSION_MINOR_GET(csf_version);
+
+	/* Supported from version 3.2 onwards; the minor only matters for 3.x */
+	if (major != 3)
+		return major > 3;
+
+	return minor >= 2;
+}

 int kbase_csf_queue_group_create(struct kbase_context *const kctx,
 			union kbase_ioctl_cs_queue_group_create *const create)
@ -1434,8 +1478,17 @@ int kbase_csf_queue_group_create(struct kbase_context *const kctx,
 		dev_warn(kctx->kbdev->dev, "Unknown exception handler flags set: %u",
 			 create->in.csi_handlers & ~BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK);
 		err = -EINVAL;
-	} else if (create->in.reserved) {
-		dev_warn(kctx->kbdev->dev, "Reserved field was set to non-0");
+	} else if (!dvs_supported(kctx->kbdev->csf.global_iface.version) &&
+		   create->in.dvs_buf) {
+		dev_warn(
+			kctx->kbdev->dev,
+			"GPU does not support DVS but userspace is trying to use it");
+		err = -EINVAL;
+	} else if (dvs_supported(kctx->kbdev->csf.global_iface.version) &&
+		   !CSG_DVS_BUF_BUFFER_POINTER_GET(create->in.dvs_buf) &&
+		   CSG_DVS_BUF_BUFFER_SIZE_GET(create->in.dvs_buf)) {
+		dev_warn(kctx->kbdev->dev,
+			 "DVS buffer pointer is null but size is not 0");
 		err = -EINVAL;
 	} else {
 		/* For the CSG which satisfies the condition for having
@ -1555,6 +1608,7 @@ void kbase_csf_term_descheduled_queue_group(struct kbase_queue_group *group)
 			&group->protected_suspend_buf);

 	group->run_state = KBASE_CSF_GROUP_TERMINATED;
+	KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_TERMINATED, group, group->run_state);
 }

 /**
@ -1585,6 +1639,34 @@ static void term_queue_group(struct kbase_queue_group *group)
 	kbase_csf_term_descheduled_queue_group(group);
 }

+/**
+ * wait_group_deferred_deschedule_completion - Wait for refcount of the group to
+ *         become 0 that was taken when the group deschedule had to be deferred.
+ *
+ * @group: Pointer to GPU command queue group that is being deleted.
+ *
+ * This function is called when Userspace deletes the group and after the group
+ * has been descheduled. The function synchronizes with the other threads that were
+ * also trying to deschedule the group whilst the dumping was going on for a fault.
+ * Please refer to the documentation of wait_for_dump_complete_on_group_deschedule()
+ * for more details.
+ */
+static void wait_group_deferred_deschedule_completion(struct kbase_queue_group *group)
+{
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	struct kbase_context *kctx = group->kctx;
+
+	lockdep_assert_held(&kctx->csf.lock);
+
+	if (likely(!group->deschedule_deferred_cnt))
+		return;
+
+	mutex_unlock(&kctx->csf.lock);
+	wait_event(kctx->kbdev->csf.event_wait, !group->deschedule_deferred_cnt);
+	mutex_lock(&kctx->csf.lock);
+#endif
+}
+
 static void cancel_queue_group_events(struct kbase_queue_group *group)
 {
 	cancel_work_sync(&group->timer_event_work);
@ -1626,24 +1708,39 @@ void kbase_csf_queue_group_terminate(struct kbase_context *kctx,
 	group = find_queue_group(kctx, group_handle);

 	if (group) {
-		remove_pending_group_fatal_error(group);
-		term_queue_group(group);
 		kctx->csf.queue_groups[group_handle] = NULL;
+		/* Stop the running of the given group */
+		term_queue_group(group);
+		mutex_unlock(&kctx->csf.lock);
+
+		if (reset_prevented) {
+			/* Allow GPU reset before cancelling the group specific
+			 * work item to avoid potential deadlock.
+			 * Reset prevention isn't needed after group termination.
+			 */
+			kbase_reset_gpu_allow(kbdev);
+			reset_prevented = false;
+		}
+
+		/* Cancel any pending event callbacks. If one is in progress
+		 * then this thread waits synchronously for it to complete (which
+		 * is why we must unlock the context first). We already ensured
+		 * that no more callbacks can be enqueued by terminating the group.
+		 */
+		cancel_queue_group_events(group);
+
+		mutex_lock(&kctx->csf.lock);
+
+		/* Clean up after the termination */
+		remove_pending_group_fatal_error(group);
+
+		wait_group_deferred_deschedule_completion(group);
 	}

 	mutex_unlock(&kctx->csf.lock);
 	if (reset_prevented)
 		kbase_reset_gpu_allow(kbdev);

-	if (!group)
-		return;
-
-	/* Cancel any pending event callbacks. If one is in progress
-	 * then this thread waits synchronously for it to complete (which
-	 * is why we must unlock the context first). We already ensured
-	 * that no more callbacks can be enqueued by terminating the group.
-	 */
-	cancel_queue_group_events(group);
 	kfree(group);
 }

@ -1738,7 +1835,6 @@ void kbase_csf_active_queue_groups_reset(struct kbase_device *kbdev,

 int kbase_csf_ctx_init(struct kbase_context *kctx)
 {
-	struct kbase_device *kbdev = kctx->kbdev;
 	int err = -ENOMEM;

 	INIT_LIST_HEAD(&kctx->csf.queue_list);
@ -1747,19 +1843,6 @@ int kbase_csf_ctx_init(struct kbase_context *kctx)
 	kbase_csf_event_init(kctx);

 	kctx->csf.user_reg_vma = NULL;
-	mutex_lock(&kbdev->pm.lock);
-	/* The inode information for /dev/malixx file is not available at the
-	 * time of device probe as the inode is created when the device node
-	 * is created by udevd (through mknod).
-	 */
-	if (kctx->filp) {
-		if (!kbdev->csf.mali_file_inode)
-			kbdev->csf.mali_file_inode = kctx->filp->f_inode;
-
-		/* inode is unique for a file */
-		WARN_ON(kbdev->csf.mali_file_inode != kctx->filp->f_inode);
-	}
-	mutex_unlock(&kbdev->pm.lock);

 	/* Mark all the cookies as 'free' */
 	bitmap_fill(kctx->csf.cookies, KBASE_CSF_NUM_USER_IO_PAGES_HANDLE);
@ -1874,8 +1957,6 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
 	else
 		reset_prevented = true;

-	cancel_work_sync(&kctx->csf.pending_submission_work);
-
 	mutex_lock(&kctx->csf.lock);

 	/* Iterate through the queue groups that were not terminated by
@ -1894,6 +1975,8 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
 	if (reset_prevented)
 		kbase_reset_gpu_allow(kbdev);

+	cancel_work_sync(&kctx->csf.pending_submission_work);
+
 	/* Now that all queue groups have been terminated, there can be no
 	 * more OoM or timer event interrupts but there can be inflight work
 	 * items. Destroying the wq will implicitly flush those work items.
@ -1938,7 +2021,11 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
 		 * only one reference left that was taken when queue was
 		 * registered.
 		 */
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 		if (atomic_read(&queue->refcount) != 1)
+#else
+		if (refcount_read(&queue->refcount) != 1)
+#endif
 			dev_warn(kctx->kbdev->dev,
 				 "Releasing queue with incorrect refcounting!\n");
 		list_del_init(&queue->link);
@ -2059,6 +2146,36 @@ static void report_tiler_oom_error(struct kbase_queue_group *group)
 	kbase_event_wakeup(group->kctx);
 }

+static void flush_gpu_cache_on_fatal_error(struct kbase_device *kbdev)
+{
+	int err;
+	const unsigned int cache_flush_wait_timeout_ms = 2000;
+
+	kbase_pm_lock(kbdev);
+	/* With the advent of partial cache flush, dirty cache lines could
+	 * be left in the GPU L2 caches by terminating the queue group here
+	 * without waiting for proper cache maintenance. A full cache flush
+	 * here will prevent these dirty cache lines from being arbitrarily
+	 * evicted later and possibly causing memory corruption.
+	 */
+	if (kbdev->pm.backend.gpu_powered) {
+		kbase_gpu_start_cache_clean(kbdev, GPU_COMMAND_CACHE_CLN_INV_L2_LSC);
+		err = kbase_gpu_wait_cache_clean_timeout(kbdev, cache_flush_wait_timeout_ms);
+
+		if (err) {
+			dev_warn(
+				kbdev->dev,
+				"[%llu] Timeout waiting for cache clean to complete after fatal error",
+				kbase_backend_get_cycle_cnt(kbdev));
+
+			if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR))
+				kbase_reset_gpu(kbdev);
+		}
+	}
+
+	kbase_pm_unlock(kbdev);
+}
+
 /**
 * kbase_queue_oom_event - Handle tiler out-of-memory for a GPU command queue.
 *
@ -2071,8 +2188,8 @@ static void report_tiler_oom_error(struct kbase_queue_group *group)
 * notification to allow the firmware to report out-of-memory again in future.
 * If the out-of-memory condition was successfully handled then this function
 * rings the relevant doorbell to notify the firmware; otherwise, it terminates
- * the GPU command queue group to which the queue is bound. See
- * term_queue_group() for details.
+ * the GPU command queue group to which the queue is bound and notifies a waiting
+ * user space client of the failure.
 */
 static void kbase_queue_oom_event(struct kbase_queue *const queue)
 {
@ -2084,6 +2201,7 @@ static void kbase_queue_oom_event(struct kbase_queue *const queue)
 	struct kbase_csf_cmd_stream_info const *stream;
 	int csi_index = queue->csi_index;
 	u32 cs_oom_ack, cs_oom_req;
+	unsigned long flags;

 	lockdep_assert_held(&kctx->csf.lock);

@ -2129,20 +2247,23 @@ static void kbase_queue_oom_event(struct kbase_queue *const queue)

 	err = handle_oom_event(group, stream);

+	kbase_csf_scheduler_spin_lock(kbdev, &flags);
 	kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_oom_ack,
 					 CS_REQ_TILER_OOM_MASK);
+	kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true);
+	kbase_csf_scheduler_spin_unlock(kbdev, flags);

-	if (err) {
+	if (unlikely(err)) {
 		dev_warn(
 			kbdev->dev,
 			"Queue group to be terminated, couldn't handle the OoM event\n");
+		kbase_debug_csf_fault_notify(kbdev, kctx, DF_TILER_OOM);
 		kbase_csf_scheduler_unlock(kbdev);
 		term_queue_group(group);
+		flush_gpu_cache_on_fatal_error(kbdev);
 		report_tiler_oom_error(group);
 		return;
 	}
-
-	kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true);
 unlock:
 	kbase_csf_scheduler_unlock(kbdev);
 }
@ -2164,6 +2285,7 @@ static void oom_event_worker(struct work_struct *data)
 	struct kbase_device *const kbdev = kctx->kbdev;

 	int err = kbase_reset_gpu_try_prevent(kbdev);
+
 	/* Regardless of whether reset failed or is currently happening, exit
 	 * early
 	 */
@ -2216,12 +2338,13 @@ static void timer_event_worker(struct work_struct *data)
 	struct kbase_queue_group *const group =
 		container_of(data, struct kbase_queue_group, timer_event_work);
 	struct kbase_context *const kctx = group->kctx;
+	struct kbase_device *const kbdev = kctx->kbdev;
 	bool reset_prevented = false;
-	int err = kbase_reset_gpu_prevent_and_wait(kctx->kbdev);
+	int err = kbase_reset_gpu_prevent_and_wait(kbdev);

 	if (err)
 		dev_warn(
-			kctx->kbdev->dev,
+			kbdev->dev,
 			"Unsuccessful GPU reset detected when terminating group %d on progress timeout, attempting to terminate regardless",
 			group->handle);
 	else
@ -2230,11 +2353,12 @@ static void timer_event_worker(struct work_struct *data)
 	mutex_lock(&kctx->csf.lock);

 	term_queue_group(group);
+	flush_gpu_cache_on_fatal_error(kbdev);
 	report_group_timeout_error(group);

 	mutex_unlock(&kctx->csf.lock);
 	if (reset_prevented)
-		kbase_reset_gpu_allow(kctx->kbdev);
+		kbase_reset_gpu_allow(kbdev);
 }

 /**
@ -2242,11 +2366,15 @@ static void timer_event_worker(struct work_struct *data)
 *
 * @group: Pointer to GPU queue group for which the timeout event is received.
 *
+ * Notify a waiting user space client of the timeout.
 * Enqueue a work item to terminate the group and notify the event notification
 * thread of progress timeout fault for the GPU command queue group.
 */
 static void handle_progress_timer_event(struct kbase_queue_group *const group)
 {
+	kbase_debug_csf_fault_notify(group->kctx->kbdev, group->kctx,
+		DF_PROGRESS_TIMER_TIMEOUT);
+
 	queue_work(group->kctx->csf.wq, &group->timer_event_work);
 }

@ -2274,16 +2402,20 @@ static void protm_event_worker(struct work_struct *data)
 * handle_fault_event - Handler for CS fault.
 *
 * @queue:  Pointer to queue for which fault event was received.
- * @stream: Pointer to the structure containing info provided by the
- *          firmware about the CSI.
- *
- * Prints meaningful CS fault information.
+ * @cs_ack: Value of the CS_ACK register in the CS kernel output page used for
+ *          the queue.
 *
+ * Print required information about the CS fault and notify the user space client
+ * about the fault.
 */
 static void
-handle_fault_event(struct kbase_queue *const queue,
-		   struct kbase_csf_cmd_stream_info const *const stream)
+handle_fault_event(struct kbase_queue *const queue, const u32 cs_ack)
 {
+	struct kbase_device *const kbdev = queue->kctx->kbdev;
+	struct kbase_csf_cmd_stream_group_info const *ginfo =
+			&kbdev->csf.global_iface.groups[queue->group->csg_nr];
+	struct kbase_csf_cmd_stream_info const *stream =
+			&ginfo->streams[queue->csi_index];
 	const u32 cs_fault = kbase_csf_firmware_cs_output(stream, CS_FAULT);
 	const u64 cs_fault_info =
 		kbase_csf_firmware_cs_output(stream, CS_FAULT_INFO_LO) |
@ -2295,7 +2427,6 @@ handle_fault_event(struct kbase_queue *const queue,
 		CS_FAULT_EXCEPTION_DATA_GET(cs_fault);
 	const u64 cs_fault_info_exception_data =
 		CS_FAULT_INFO_EXCEPTION_DATA_GET(cs_fault_info);
-	struct kbase_device *const kbdev = queue->kctx->kbdev;

 	kbase_csf_scheduler_spin_lock_assert_held(kbdev);

@ -2310,6 +2441,36 @@ handle_fault_event(struct kbase_queue *const queue,
 		 kbase_gpu_exception_name(cs_fault_exception_type),
 		 cs_fault_exception_data, cs_fault_info_exception_data);

+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	/* CS_RESOURCE_TERMINATED type fault event can be ignored from the
+	 * standpoint of dump on error. It is used to report fault for the CSIs
+	 * that are associated with the same CSG as the CSI for which the actual
+	 * fault was reported by the Iterator.
+	 * Dumping would be triggered when the actual fault is reported.
+	 *
+	 * CS_INHERIT_FAULT can also be ignored. It could happen due to the error
+	 * in other types of queues (cpu/kcpu). If a fault had occurred in some
+	 * other GPU queue then the dump would have been performed anyways when
+	 * that fault was reported.
+	 */
+	if ((cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_INHERIT_FAULT) &&
+	    (cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_RESOURCE_TERMINATED)) {
+		if (unlikely(kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_CS_FAULT))) {
+			get_queue(queue);
+			queue->cs_error = cs_fault;
+			queue->cs_error_info = cs_fault_info;
+			queue->cs_error_fatal = false;
+			if (!queue_work(queue->kctx->csf.wq, &queue->cs_error_work))
+				release_queue(queue);
+			return;
+		}
+	}
+#endif
+
+	kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack,
+					 CS_REQ_FAULT_MASK);
+	kbase_csf_ring_cs_kernel_doorbell(kbdev, queue->csi_index, queue->group->csg_nr, true);
 }

 static void report_queue_fatal_error(struct kbase_queue *const queue,
@ -2341,16 +2502,16 @@ static void report_queue_fatal_error(struct kbase_queue *const queue,
 }

 /**
- * fatal_event_worker - Handle the fatal error for the GPU queue
+ * cs_error_worker - Handle the CS_FATAL/CS_FAULT error for the GPU queue
 *
 * @data: Pointer to a work_struct embedded in GPU command queue.
 *
 * Terminate the CSG and report the error to userspace.
 */
-static void fatal_event_worker(struct work_struct *const data)
+static void cs_error_worker(struct work_struct *const data)
 {
 	struct kbase_queue *const queue =
-		container_of(data, struct kbase_queue, fatal_event_work);
+		container_of(data, struct kbase_queue, cs_error_work);
 	struct kbase_context *const kctx = queue->kctx;
 	struct kbase_device *const kbdev = kctx->kbdev;
 	struct kbase_queue_group *group;
@ -2365,6 +2526,7 @@ static void fatal_event_worker(struct work_struct *const data)
 	else
 		reset_prevented = true;

+	kbase_debug_csf_fault_wait_completion(kbdev);
 	mutex_lock(&kctx->csf.lock);

 	group = get_bound_queue_group(queue);
@ -2373,9 +2535,35 @@ static void fatal_event_worker(struct work_struct *const data)
 		goto unlock;
 	}

+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	if (!queue->cs_error_fatal) {
+		unsigned long flags;
+		int slot_num;
+
+		kbase_csf_scheduler_spin_lock(kbdev, &flags);
+		slot_num = kbase_csf_scheduler_group_get_slot_locked(group);
+		if (slot_num >= 0) {
+			struct kbase_csf_cmd_stream_group_info const *ginfo =
+				&kbdev->csf.global_iface.groups[slot_num];
+			struct kbase_csf_cmd_stream_info const *stream =
+				&ginfo->streams[queue->csi_index];
+			u32 const cs_ack =
+				kbase_csf_firmware_cs_output(stream, CS_ACK);
+
+			kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack,
+				CS_REQ_FAULT_MASK);
+			kbase_csf_ring_cs_kernel_doorbell(kbdev, queue->csi_index,
+				slot_num, true);
+		}
+		kbase_csf_scheduler_spin_unlock(kbdev, flags);
+		goto unlock;
+	}
+#endif
+
 	group_handle = group->handle;
 	term_queue_group(group);
-	report_queue_fatal_error(queue, queue->cs_fatal, queue->cs_fatal_info,
+	flush_gpu_cache_on_fatal_error(kbdev);
+	report_queue_fatal_error(queue, queue->cs_error, queue->cs_error_info,
 				 group_handle);

 unlock:
@ -2391,14 +2579,18 @@ static void fatal_event_worker(struct work_struct *const data)
 * @queue:    Pointer to queue for which fatal event was received.
 * @stream:   Pointer to the structure containing info provided by the
 *            firmware about the CSI.
+ * @cs_ack:   Value of the CS_ACK register in the CS kernel output page used for
+ *            the queue.
 *
- * Prints meaningful CS fatal information.
+ * Notify a waiting user space client of the CS fatal and print meaningful
+ * information.
 * Enqueue a work item to terminate the group and report the fatal error
 * to user space.
 */
 static void
 handle_fatal_event(struct kbase_queue *const queue,
-		   struct kbase_csf_cmd_stream_info const *const stream)
+		   struct kbase_csf_cmd_stream_info const *const stream,
+		   u32 cs_ack)
 {
 	const u32 cs_fatal = kbase_csf_firmware_cs_output(stream, CS_FATAL);
 	const u64 cs_fatal_info =
@ -2428,57 +2620,26 @@ handle_fatal_event(struct kbase_queue *const queue,

 	if (cs_fatal_exception_type ==
 			CS_FATAL_EXCEPTION_TYPE_FIRMWARE_INTERNAL_ERROR) {
+		kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_FW_INTERNAL_ERROR);
 		queue_work(system_wq, &kbdev->csf.fw_error_work);
 	} else {
+		kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_CS_FATAL);
 		if (cs_fatal_exception_type == CS_FATAL_EXCEPTION_TYPE_CS_UNRECOVERABLE) {
 			queue->group->cs_unrecoverable = true;
 			if (kbase_prepare_to_reset_gpu(queue->kctx->kbdev, RESET_FLAGS_NONE))
 				kbase_reset_gpu(queue->kctx->kbdev);
 		}
 		get_queue(queue);
-		queue->cs_fatal = cs_fatal;
-		queue->cs_fatal_info = cs_fatal_info;
-		if (!queue_work(queue->kctx->csf.wq, &queue->fatal_event_work))
+		queue->cs_error = cs_fatal;
+		queue->cs_error_info = cs_fatal_info;
+		queue->cs_error_fatal = true;
+		if (!queue_work(queue->kctx->csf.wq, &queue->cs_error_work))
 			release_queue(queue);
 	}

-}
+	kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack,
+					CS_REQ_FATAL_MASK);

-/**
- * handle_queue_exception_event - Handler for CS fatal/fault exception events.
- *
- * @queue:  Pointer to queue for which fatal/fault event was received.
- * @cs_req: Value of the CS_REQ register from the CS's input page.
- * @cs_ack: Value of the CS_ACK register from the CS's output page.
- */
-static void handle_queue_exception_event(struct kbase_queue *const queue,
-					 const u32 cs_req, const u32 cs_ack)
-{
-	struct kbase_csf_cmd_stream_group_info const *ginfo;
-	struct kbase_csf_cmd_stream_info const *stream;
-	struct kbase_context *const kctx = queue->kctx;
-	struct kbase_device *const kbdev = kctx->kbdev;
-	struct kbase_queue_group *group = queue->group;
-	int csi_index = queue->csi_index;
-	int slot_num = group->csg_nr;
-
-	kbase_csf_scheduler_spin_lock_assert_held(kbdev);
-
-	ginfo = &kbdev->csf.global_iface.groups[slot_num];
-	stream = &ginfo->streams[csi_index];
-
-	if ((cs_ack & CS_ACK_FATAL_MASK) != (cs_req & CS_REQ_FATAL_MASK)) {
-		handle_fatal_event(queue, stream);
-		kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack,
-						 CS_REQ_FATAL_MASK);
-	}
-
-	if ((cs_ack & CS_ACK_FAULT_MASK) != (cs_req & CS_REQ_FAULT_MASK)) {
-		handle_fault_event(queue, stream);
-		kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack,
-						 CS_REQ_FAULT_MASK);
-		kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true);
-	}
 }

 /**
@ -2531,11 +2692,16 @@ static void process_cs_interrupts(struct kbase_queue_group *const group,
 				kbase_csf_firmware_cs_output(stream, CS_ACK);
 			struct workqueue_struct *wq = group->kctx->csf.wq;

-			if ((cs_req & CS_REQ_EXCEPTION_MASK) ^
-			    (cs_ack & CS_ACK_EXCEPTION_MASK)) {
+			if ((cs_ack & CS_ACK_FATAL_MASK) != (cs_req & CS_REQ_FATAL_MASK)) {
 				KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_INTERRUPT_FAULT,
 							 group, queue, cs_req ^ cs_ack);
-				handle_queue_exception_event(queue, cs_req, cs_ack);
+				handle_fatal_event(queue, stream, cs_ack);
+			}
+
+			if ((cs_ack & CS_ACK_FAULT_MASK) != (cs_req & CS_REQ_FAULT_MASK)) {
+				KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_INTERRUPT_FAULT,
+							 group, queue, cs_req ^ cs_ack);
+				handle_fault_event(queue, cs_ack);
 			}

 			/* PROTM_PEND and TILER_OOM can be safely ignored
@ -2597,6 +2763,8 @@ static void process_cs_interrupts(struct kbase_queue_group *const group,
 		if (test_bit(group->csg_nr, scheduler->csg_slots_idle_mask)) {
 			clear_bit(group->csg_nr,
 				  scheduler->csg_slots_idle_mask);
+			KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_IDLE_CLEAR, group,
+							scheduler->csg_slots_idle_mask[0]);
 			dev_dbg(kbdev->dev,
 				"Group-%d on slot %d de-idled by protm request",
 				group->handle, group->csg_nr);
@ -2698,7 +2866,12 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, int const c
 			/* If there are non-idle CSGs waiting for a slot, fire
 			 * a tock for a replacement.
 			 */
-			mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
+			KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_NON_IDLE_GROUPS,
+						group, req ^ ack);
+			kbase_csf_scheduler_invoke_tock(kbdev);
+		} else {
+			KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_NO_NON_IDLE_GROUPS,
+						group, req ^ ack);
 		}

 		if (group->scan_seq_num < track->idle_seq) {
@ -2709,14 +2882,15 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, int const c

 	if ((req ^ ack) & CSG_REQ_PROGRESS_TIMER_EVENT_MASK) {
 		kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, ack,
-			CSG_REQ_PROGRESS_TIMER_EVENT_MASK);
+						  CSG_REQ_PROGRESS_TIMER_EVENT_MASK);

-		KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_PROGRESS_TIMER_EVENT,
-					 group, req ^ ack);
-		dev_info(kbdev->dev,
+		KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_PROGRESS_TIMER_EVENT, group,
+					 req ^ ack);
+		dev_info(
+			kbdev->dev,
 			"[%llu] Iterator PROGRESS_TIMER timeout notification received for group %u of ctx %d_%d on slot %d\n",
-			kbase_backend_get_cycle_cnt(kbdev),
-			group->handle, group->kctx->tgid, group->kctx->id, csg_nr);
+			kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid,
+			group->kctx->id, csg_nr);

 		handle_progress_timer_event(group);
 	}
@ -2904,7 +3078,7 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev,
 		 * for the scheduler to re-examine the case.
 		 */
 		dev_dbg(kbdev->dev, "Attempt pending protm from idle slot %d\n", track->idle_slot);
-		mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
+		kbase_csf_scheduler_invoke_tock(kbdev);
 	} else if (group) {
 		u32 i, num_groups = kbdev->csf.global_iface.group_num;
 		struct kbase_queue_group *grp;
@ -2927,7 +3101,7 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev,
 				tock_triggered = true;
 				dev_dbg(kbdev->dev,
 					"Attempt new protm from tick/tock idle slot %d\n", i);
-				mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
+				kbase_csf_scheduler_invoke_tock(kbdev);
 				break;
 			}
 		}
@ -2940,77 +3114,133 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev,
 	}
 }

+static void order_job_irq_clear_with_iface_mem_read(void)
+{
+	/* Ensure that the write to JOB_IRQ_CLEAR is ordered with regard to the
+	 * read from interface memory. The ordering is needed considering the way
+	 * FW & Kbase write to the JOB_IRQ_RAWSTAT and JOB_IRQ_CLEAR registers
+	 * without any synchronization. Without the barrier there is no guarantee
+	 * about the ordering: the write to IRQ_CLEAR can take effect after the read
+	 * from interface memory, and that could cause a problem for the scenario where
+	 * FW sends back-to-back notifications for the same CSG for events like
+	 * SYNC_UPDATE and IDLE, but Kbase gets a single IRQ and observes only the
+	 * first event. A similar thing can happen with glb events like CFG_ALLOC_EN
+	 * acknowledgment and GPU idle notification.
+	 *
+	 *       MCU                                    CPU
+	 *  ---------------                         ----------------
+	 *  Update interface memory                 Write to IRQ_CLEAR to clear current IRQ
+	 *  <barrier>                               <barrier>
+	 *  Write to IRQ_RAWSTAT to raise new IRQ   Read interface memory
+	 */
+
+	/* CPU and GPU would be in the same Outer shareable domain */
+	dmb(osh);
+}
+
 void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val)
 {
-	unsigned long flags;
-	u32 csg_interrupts = val & ~JOB_IRQ_GLOBAL_IF;
-	struct irq_idle_and_protm_track track = { .protm_grp = NULL, .idle_seq = U32_MAX };
+	bool deferred_handling_glb_idle_irq = false;

 	lockdep_assert_held(&kbdev->hwaccess_lock);

 	KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_START, NULL, val);
-	kbase_reg_write(kbdev, JOB_CONTROL_REG(JOB_IRQ_CLEAR), val);

-	if (csg_interrupts != 0) {
-		kbase_csf_scheduler_spin_lock(kbdev, &flags);
-		/* Looping through and track the highest idle and protm groups */
-		while (csg_interrupts != 0) {
-			int const csg_nr = ffs(csg_interrupts) - 1;
+	do {
+		unsigned long flags;
+		u32 csg_interrupts = val & ~JOB_IRQ_GLOBAL_IF;
+		struct irq_idle_and_protm_track track = { .protm_grp = NULL, .idle_seq = U32_MAX };
+		bool glb_idle_irq_received = false;

-			process_csg_interrupts(kbdev, csg_nr, &track);
-			csg_interrupts &= ~(1 << csg_nr);
+		kbase_reg_write(kbdev, JOB_CONTROL_REG(JOB_IRQ_CLEAR), val);
+		order_job_irq_clear_with_iface_mem_read();
+
+		if (csg_interrupts != 0) {
+			kbase_csf_scheduler_spin_lock(kbdev, &flags);
+			/* Looping through and track the highest idle and protm groups */
+			while (csg_interrupts != 0) {
+				int const csg_nr = ffs(csg_interrupts) - 1;
+
+				process_csg_interrupts(kbdev, csg_nr, &track);
+				csg_interrupts &= ~(1 << csg_nr);
+			}
+
+			/* Handle protm from the tracked information */
+			process_tracked_info_for_protm(kbdev, &track);
+			kbase_csf_scheduler_spin_unlock(kbdev, flags);
 		}

-		/* Handle protm from the tracked information */
-		process_tracked_info_for_protm(kbdev, &track);
-		kbase_csf_scheduler_spin_unlock(kbdev, flags);
-	}
+		if (val & JOB_IRQ_GLOBAL_IF) {
+			const struct kbase_csf_global_iface *const global_iface =
+				&kbdev->csf.global_iface;

-	if (val & JOB_IRQ_GLOBAL_IF) {
-		const struct kbase_csf_global_iface *const global_iface =
-			&kbdev->csf.global_iface;
+			kbdev->csf.interrupt_received = true;

-		kbdev->csf.interrupt_received = true;
+			if (!kbdev->csf.firmware_reloaded)
+				kbase_csf_firmware_reload_completed(kbdev);
+			else if (global_iface->output) {
+				u32 glb_req, glb_ack;

-		if (!kbdev->csf.firmware_reloaded)
-			kbase_csf_firmware_reload_completed(kbdev);
-		else if (global_iface->output) {
-			u32 glb_req, glb_ack;
+				kbase_csf_scheduler_spin_lock(kbdev, &flags);
+				glb_req =
+					kbase_csf_firmware_global_input_read(global_iface, GLB_REQ);
+				glb_ack = kbase_csf_firmware_global_output(global_iface, GLB_ACK);
+				KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_GLB_REQ_ACK, NULL,
+						 glb_req ^ glb_ack);

-			kbase_csf_scheduler_spin_lock(kbdev, &flags);
-			glb_req = kbase_csf_firmware_global_input_read(
-					global_iface, GLB_REQ);
-			glb_ack = kbase_csf_firmware_global_output(
-					global_iface, GLB_ACK);
-			KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_GLB_REQ_ACK, NULL, glb_req ^ glb_ack);
+				check_protm_enter_req_complete(kbdev, glb_req, glb_ack);

-			check_protm_enter_req_complete(kbdev, glb_req, glb_ack);
+				if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK)
+					process_protm_exit(kbdev, glb_ack);

-			if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK)
-				process_protm_exit(kbdev, glb_ack);
-
-			/* Handle IDLE Hysteresis notification event */
-			if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) {
-				dev_dbg(kbdev->dev, "Idle-hysteresis event flagged");
-				kbase_csf_firmware_global_input_mask(
+				/* Handle IDLE Hysteresis notification event */
+				if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) {
+					dev_dbg(kbdev->dev, "Idle-hysteresis event flagged");
+					kbase_csf_firmware_global_input_mask(
 						global_iface, GLB_REQ, glb_ack,
 						GLB_REQ_IDLE_EVENT_MASK);

-				kbase_csf_scheduler_process_gpu_idle_event(kbdev);
+					glb_idle_irq_received = true;
+					/* Defer handling this IRQ to account for a race condition
+					 * where the idle worker could be executed before we have
+					 * finished handling all pending IRQs (including CSG IDLE
+					 * IRQs).
+					 */
+					deferred_handling_glb_idle_irq = true;
+				}
+
+				process_prfcnt_interrupts(kbdev, glb_req, glb_ack);
+
+				kbase_csf_scheduler_spin_unlock(kbdev, flags);
+
+				/* Invoke the MCU state machine as a state transition
+				 * might have completed.
+				 */
+				kbase_pm_update_state(kbdev);
 			}
-
-			process_prfcnt_interrupts(kbdev, glb_req, glb_ack);
-
-			kbase_csf_scheduler_spin_unlock(kbdev, flags);
-
-			/* Invoke the MCU state machine as a state transition
-			 * might have completed.
-			 */
-			kbase_pm_update_state(kbdev);
 		}
+
+		if (!glb_idle_irq_received)
+			break;
+		/* Attempt to serve potential IRQs that might have occurred
+		 * whilst handling the previous IRQ. In case we have observed
+		 * the GLB IDLE IRQ without all CSGs having been marked as
+		 * idle, the GPU would be treated as no longer idle and left
+		 * powered on.
+		 */
+		val = kbase_reg_read(kbdev, JOB_CONTROL_REG(JOB_IRQ_STATUS));
+	} while (val);
+
+	if (deferred_handling_glb_idle_irq) {
+		unsigned long flags;
+
+		kbase_csf_scheduler_spin_lock(kbdev, &flags);
+		kbase_csf_scheduler_process_gpu_idle_event(kbdev);
+		kbase_csf_scheduler_spin_unlock(kbdev, flags);
 	}

 	wake_up_all(&kbdev->csf.event_wait);
+
 	KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_END, NULL, val);
 }

@ -3037,9 +3267,8 @@ int kbase_csf_doorbell_mapping_init(struct kbase_device *kbdev)
 	if (IS_ERR(filp))
 		return PTR_ERR(filp);

-	ret = kbase_mem_pool_alloc_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
-		1, &phys, false);
+	ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys,
+					 false);

 	if (ret <= 0) {
 		fput(filp);
@ -3073,9 +3302,8 @@ int kbase_csf_setup_dummy_user_reg_page(struct kbase_device *kbdev)

 	kbdev->csf.dummy_user_reg_page = as_tagged(0);

-	ret = kbase_mem_pool_alloc_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys,
-		false);
+	ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys,
+					 false);

 	if (ret <= 0)
 		return ret;
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c
@ -23,12 +23,135 @@
 #include <mali_kbase.h>
 #include <linux/seq_file.h>
 #include <linux/delay.h>
-#include <csf/mali_kbase_csf_trace_buffer.h>
 #include <backend/gpu/mali_kbase_pm_internal.h>

 #if IS_ENABLED(CONFIG_DEBUG_FS)
 #include "mali_kbase_csf_tl_reader.h"

+/* Wait time to be used cumulatively for all the CSG slots.
+ * Since scheduler lock is held when STATUS_UPDATE request is sent, there won't be
+ * any other Host request pending on the FW side and usually FW would be responsive
+ * to the Doorbell IRQs as it won't do any polling for a long time and also it won't
+ * have to wait for any HW state transition to complete for publishing the status.
+ * So it is reasonable to expect that handling of STATUS_UPDATE request would be
+ * relatively very quick.
+ */
+#define STATUS_UPDATE_WAIT_TIMEOUT 500
+
+/* The bitmask of CSG slots for which the STATUS_UPDATE request completed.
+ * Access to it is serialized with the scheduler lock, so at any given time it
+ * is used either for the "active_groups" or the per-context "groups" debugfs file.
+ */
+static DECLARE_BITMAP(csg_slots_status_updated, MAX_SUPPORTED_CSGS);
+
+static
+bool csg_slot_status_update_finish(struct kbase_device *kbdev, u32 csg_nr)
+{
+	struct kbase_csf_cmd_stream_group_info const *const ginfo =
+		&kbdev->csf.global_iface.groups[csg_nr];
+
+	return !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^
+		  kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) &
+			     CSG_REQ_STATUS_UPDATE_MASK);
+}
+
+static
+bool csg_slots_status_update_finish(struct kbase_device *kbdev,
+		const unsigned long *slots_mask)
+{
+	const u32 max_csg_slots = kbdev->csf.global_iface.group_num;
+	bool changed = false;
+	u32 csg_nr;
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	for_each_set_bit(csg_nr, slots_mask, max_csg_slots) {
+		if (csg_slot_status_update_finish(kbdev, csg_nr)) {
+			set_bit(csg_nr, csg_slots_status_updated);
+			changed = true;
+		}
+	}
+
+	return changed;
+}
+
+static void wait_csg_slots_status_update_finish(struct kbase_device *kbdev,
+		unsigned long *slots_mask)
+{
+	const u32 max_csg_slots = kbdev->csf.global_iface.group_num;
+	long remaining = kbase_csf_timeout_in_jiffies(STATUS_UPDATE_WAIT_TIMEOUT);
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	bitmap_zero(csg_slots_status_updated, max_csg_slots);
+
+	while (!bitmap_empty(slots_mask, max_csg_slots) && remaining) {
+		remaining = wait_event_timeout(kbdev->csf.event_wait,
+				csg_slots_status_update_finish(kbdev, slots_mask),
+				remaining);
+		if (likely(remaining)) {
+			bitmap_andnot(slots_mask, slots_mask,
+				csg_slots_status_updated, max_csg_slots);
+		} else {
+			dev_warn(kbdev->dev,
+				 "STATUS_UPDATE request timed out for slots 0x%lx",
+				 slots_mask[0]);
+		}
+	}
+}
+
+static void update_active_groups_status(struct kbase_device *kbdev, struct seq_file *file)
+{
+	u32 max_csg_slots = kbdev->csf.global_iface.group_num;
+	DECLARE_BITMAP(used_csgs, MAX_SUPPORTED_CSGS) = { 0 };
+	u32 csg_nr;
+	unsigned long flags;
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	/* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell
+	 * ring for Extract offset update, shall not be made when MCU has been
+	 * put to sleep otherwise it will undesirably make MCU exit the sleep
+	 * state. Also it isn't really needed as FW will implicitly update the
+	 * status of all on-slot groups when MCU sleep request is sent to it.
+	 */
+	if (kbdev->csf.scheduler.state == SCHED_SLEEPING) {
+		bitmap_copy(csg_slots_status_updated,
+			    kbdev->csf.scheduler.csg_inuse_bitmap, max_csg_slots);
+		return;
+	}
+
+	for (csg_nr = 0; csg_nr < max_csg_slots; csg_nr++) {
+		struct kbase_queue_group *const group =
+			kbdev->csf.scheduler.csg_slots[csg_nr].resident_group;
+		if (!group)
+			continue;
+		/* Ring the User doorbell for FW to update the Extract offset */
+		kbase_csf_ring_doorbell(kbdev, group->doorbell_nr);
+		set_bit(csg_nr, used_csgs);
+	}
+
+	/* Return early if there are no on-slot groups */
+	if (bitmap_empty(used_csgs, max_csg_slots))
+		return;
+
+	kbase_csf_scheduler_spin_lock(kbdev, &flags);
+	for_each_set_bit(csg_nr, used_csgs, max_csg_slots) {
+		struct kbase_csf_cmd_stream_group_info const *const ginfo =
+			&kbdev->csf.global_iface.groups[csg_nr];
+		kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ,
+						  ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK),
+						  CSG_REQ_STATUS_UPDATE_MASK);
+	}
+
+	BUILD_BUG_ON(MAX_SUPPORTED_CSGS > (sizeof(used_csgs[0]) * BITS_PER_BYTE));
+	kbase_csf_ring_csg_slots_doorbell(kbdev, used_csgs[0]);
+	kbase_csf_scheduler_spin_unlock(kbdev, flags);
+	wait_csg_slots_status_update_finish(kbdev, used_csgs);
+	/* Wait for the User doorbell ring to take effect */
+	msleep(100);
+}
+
 #define MAX_SCHED_STATE_STRING_LEN (16)
 static const char *scheduler_state_to_string(struct kbase_device *kbdev,
 			enum kbase_csf_scheduler_state sched_state)
@ -77,16 +200,32 @@ static const char *blocked_reason_to_string(u32 reason_id)
 	return cs_blocked_reason[reason_id];
 }

+static bool sb_source_supported(u32 glb_version)
+{
+	bool supported = false;
+
+	if (((GLB_VERSION_MAJOR_GET(glb_version) == 3) &&
+	     (GLB_VERSION_MINOR_GET(glb_version) >= 5)) ||
+	    ((GLB_VERSION_MAJOR_GET(glb_version) == 2) &&
+	     (GLB_VERSION_MINOR_GET(glb_version) >= 6)) ||
+	    ((GLB_VERSION_MAJOR_GET(glb_version) == 1) &&
+	     (GLB_VERSION_MINOR_GET(glb_version) >= 3)))
+		supported = true;
+
+	return supported;
+}
+
 static void kbasep_csf_scheduler_dump_active_queue_cs_status_wait(
-	struct seq_file *file, u32 wait_status, u32 wait_sync_value,
-	u64 wait_sync_live_value, u64 wait_sync_pointer, u32 sb_status,
-	u32 blocked_reason)
+	struct seq_file *file, u32 glb_version, u32 wait_status, u32 wait_sync_value,
+	u64 wait_sync_live_value, u64 wait_sync_pointer, u32 sb_status, u32 blocked_reason)
 {
 #define WAITING "Waiting"
 #define NOT_WAITING "Not waiting"

 	seq_printf(file, "SB_MASK: %d\n",
 			CS_STATUS_WAIT_SB_MASK_GET(wait_status));
+	if (sb_source_supported(glb_version))
+		seq_printf(file, "SB_SOURCE: %d\n", CS_STATUS_WAIT_SB_SOURCE_GET(wait_status));
 	seq_printf(file, "PROGRESS_WAIT: %s\n",
 			CS_STATUS_WAIT_PROGRESS_WAIT_GET(wait_status) ?
 			WAITING : NOT_WAITING);
@ -156,10 +295,13 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file,
 	struct kbase_vmap_struct *mapping;
 	u64 *evt;
 	u64 wait_sync_live_value;
+	u32 glb_version;

 	if (!queue)
 		return;

+	glb_version = queue->kctx->kbdev->csf.global_iface.version;
+
 	if (WARN_ON(queue->csi_index == KBASEP_IF_NR_INVALID ||
 		    !queue->group))
 		return;
@ -200,9 +342,8 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file,
 			}

 			kbasep_csf_scheduler_dump_active_queue_cs_status_wait(
-				file, wait_status, wait_sync_value,
-				wait_sync_live_value, wait_sync_pointer,
-				sb_status, blocked_reason);
+				file, glb_version, wait_status, wait_sync_value,
+				wait_sync_live_value, wait_sync_pointer, sb_status, blocked_reason);
 		}
 	} else {
 		struct kbase_device const *const kbdev =
@ -257,9 +398,8 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file,
 		}

 		kbasep_csf_scheduler_dump_active_queue_cs_status_wait(
-			file, wait_status, wait_sync_value,
-			wait_sync_live_value, wait_sync_pointer, sb_status,
-			blocked_reason);
+			file, glb_version, wait_status, wait_sync_value, wait_sync_live_value,
+			wait_sync_pointer, sb_status, blocked_reason);
 		/* Dealing with cs_trace */
 		if (kbase_csf_scheduler_queue_has_trace(queue))
 			kbasep_csf_scheduler_dump_active_cs_trace(file, stream);
@ -270,54 +410,6 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file,
 	seq_puts(file, "\n");
 }

-static void update_active_group_status(struct seq_file *file,
-		struct kbase_queue_group *const group)
-{
-	struct kbase_device *const kbdev = group->kctx->kbdev;
-	struct kbase_csf_cmd_stream_group_info const *const ginfo =
-		&kbdev->csf.global_iface.groups[group->csg_nr];
-	long remaining = kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms);
-	unsigned long flags;
-
-	/* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell
-	 * ring for Extract offset update, shall not be made when MCU has been
-	 * put to sleep otherwise it will undesirably make MCU exit the sleep
-	 * state. Also it isn't really needed as FW will implicitly update the
-	 * status of all on-slot groups when MCU sleep request is sent to it.
-	 */
-	if (kbdev->csf.scheduler.state == SCHED_SLEEPING)
-		return;
-
-	/* Ring the User doobell shared between the queues bound to this
-	 * group, to have FW update the CS_EXTRACT for all the queues
-	 * bound to the group. Ring early so that FW gets adequate time
-	 * for the handling.
-	 */
-	kbase_csf_ring_doorbell(kbdev, group->doorbell_nr);
-
-	kbase_csf_scheduler_spin_lock(kbdev, &flags);
-	kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ,
-			~kbase_csf_firmware_csg_output(ginfo, CSG_ACK),
-			CSG_REQ_STATUS_UPDATE_MASK);
-	kbase_csf_scheduler_spin_unlock(kbdev, flags);
-	kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr);
-
-	remaining = wait_event_timeout(kbdev->csf.event_wait,
-		!((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^
-		kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) &
-		CSG_REQ_STATUS_UPDATE_MASK), remaining);
-
-	if (!remaining) {
-		dev_err(kbdev->dev,
-			"Timed out for STATUS_UPDATE on group %d on slot %d",
-			group->handle, group->csg_nr);
-
-		seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n",
-			group->csg_nr);
-		seq_puts(file, "*** The following group-record is likely stale\n");
-	}
-}
-
 static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file,
 		struct kbase_queue_group *const group)
 {
@ -331,8 +423,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file,
 		u8 slot_priority =
 			kbdev->csf.scheduler.csg_slots[group->csg_nr].priority;

-		update_active_group_status(file, group);
-
 		ep_c = kbase_csf_firmware_csg_output(ginfo,
 				CSG_STATUS_EP_CURRENT);
 		ep_r = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_EP_REQ);
@ -348,6 +438,12 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file,
 				CSG_STATUS_STATE_IDLE_MASK)
 			idle = 'Y';

+		if (!test_bit(group->csg_nr, csg_slots_status_updated)) {
+			seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n",
+				group->csg_nr);
+			seq_puts(file, "*** The following group-record is likely stale\n");
+		}
+
 		seq_puts(file, "GroupID, CSG NR, CSG Prio, Run State, Priority, C_EP(Alloc/Req), F_EP(Alloc/Req), T_EP(Alloc/Req), Exclusive, Idle\n");
 		seq_printf(file, "%7d, %6d, %8d, %9d, %8d, %11d/%3d, %11d/%3d, %11d/%3d, %9c, %4c\n",
 			group->handle,
@ -363,10 +459,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file,
 			CSG_STATUS_EP_REQ_TILER_EP_GET(ep_r),
 			exclusive,
 			idle);
-
-		/* Wait for the User doobell ring to take effect */
-		if (kbdev->csf.scheduler.state != SCHED_SLEEPING)
-			msleep(100);
 	} else {
 		seq_puts(file, "GroupID, CSG NR, Run State, Priority\n");
 		seq_printf(file, "%7d, %6d, %9d, %8d\n",
@ -416,10 +508,11 @@ static int kbasep_csf_queue_group_debugfs_show(struct seq_file *file,
 	kbase_csf_scheduler_lock(kbdev);
 	if (kbdev->csf.scheduler.state == SCHED_SLEEPING) {
 		/* Wait for the MCU sleep request to complete. Please refer the
-		 * update_active_group_status() function for the explanation.
+		 * update_active_groups_status() function for the explanation.
 		 */
 		kbase_pm_wait_for_desired_state(kbdev);
 	}
+	update_active_groups_status(kbdev, file);
 	for (gr = 0; gr < MAX_QUEUE_GROUP_NUM; gr++) {
 		struct kbase_queue_group *const group =
 			kctx->csf.queue_groups[gr];
@ -455,10 +548,11 @@ static int kbasep_csf_scheduler_dump_active_groups(struct seq_file *file,
 	kbase_csf_scheduler_lock(kbdev);
 	if (kbdev->csf.scheduler.state == SCHED_SLEEPING) {
 		/* Wait for the MCU sleep request to complete. Please refer the
-		 * update_active_group_status() function for the explanation.
+		 * update_active_groups_status() function for the explanation.
 		 */
 		kbase_pm_wait_for_desired_state(kbdev);
 	}
+	update_active_groups_status(kbdev, file);
 	for (csg_nr = 0; csg_nr < num_groups; csg_nr++) {
 		struct kbase_queue_group *const group =
 			kbdev->csf.scheduler.csg_slots[csg_nr].resident_group;
@ -664,7 +758,6 @@ void kbase_csf_debugfs_init(struct kbase_device *kbdev)
 			&kbasep_csf_debugfs_scheduler_state_fops);

 	kbase_csf_tl_reader_debugfs_init(kbdev);
-	kbase_csf_firmware_trace_buffer_debugfs_init(kbdev);
 }

 #else
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h
@ -31,6 +31,7 @@

 #include "mali_kbase_csf_firmware.h"
 #include "mali_kbase_csf_event.h"
+#include <uapi/gpu/arm/bifrost/csf/mali_kbase_csf_errors_dumpfault.h>

 /* Maximum number of KCPU command queues to be created per GPU address space.
 */
@ -355,14 +356,19 @@ struct kbase_csf_notification {
 * @trace_buffer_size: CS trace buffer size for the queue.
 * @trace_cfg:         CS trace configuration parameters.
 * @error:          GPU command queue fatal information to pass to user space.
- * @fatal_event_work: Work item to handle the CS fatal event reported for this
- *                    queue.
- * @cs_fatal_info:    Records additional information about the CS fatal event.
- * @cs_fatal:         Records information about the CS fatal event.
+ * @cs_error_work:    Work item to handle the CS fatal event reported for this
+ *                    queue or the CS fault event if dump on fault is enabled
+ *                    and acknowledgment for CS fault event needs to be done
+ *                    after dumping is complete.
+ * @cs_error_info:    Records additional information about the CS fatal event or
+ *                    about CS fault event if dump on fault is enabled.
+ * @cs_error:         Records information about the CS fatal event or
+ *                    about CS fault event if dump on fault is enabled.
+ * @cs_error_fatal:   Flag to track if the CS fault or CS fatal event occurred.
 * @pending:          Indicating whether the queue has new submitted work.
- * @extract_ofs: The current EXTRACT offset, this is updated during certain
- *               events such as GPU idle IRQ in order to help detect a
- *               queue's true idle status.
+ * @extract_ofs: The current EXTRACT offset, this is only updated when handling
+ *               the GLB IDLE IRQ if the idle timeout value is non-0 in order
+ *               to help detect a queue's true idle status.
 * @saved_cmd_ptr: The command pointer value for the GPU queue, saved when the
 *                 group to which queue is bound is suspended.
 *                 This can be useful in certain cases to know that till which
@ -377,7 +383,11 @@ struct kbase_queue {
 	int doorbell_nr;
 	unsigned long db_file_offset;
 	struct list_head link;
+#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE)
 	atomic_t refcount;
+#else
+	refcount_t refcount;
+#endif
 	struct kbase_queue_group *group;
 	struct kbase_va_region *queue_reg;
 	struct work_struct oom_event_work;
@ -397,14 +407,15 @@ struct kbase_queue {
 	u32 trace_buffer_size;
 	u32 trace_cfg;
 	struct kbase_csf_notification error;
-	struct work_struct fatal_event_work;
-	u64 cs_fatal_info;
-	u32 cs_fatal;
+	struct work_struct cs_error_work;
+	u64 cs_error_info;
+	u32 cs_error;
+	bool cs_error_fatal;
 	atomic_t pending;
 	u64 extract_ofs;
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	u64 saved_cmd_ptr;
-#endif
+#endif /* CONFIG_DEBUG_FS */
 };

 /**
@ -498,6 +509,9 @@ struct kbase_protected_suspend_buffer {
 *                   to be returned to userspace if such an error has occurred.
 * @timer_event_work: Work item to handle the progress timeout fatal event
 *                    for the group.
+ * @deschedule_deferred_cnt: Counter keeping track of the number of threads
+ *                           that tried to deschedule the group and had to defer
+ *                           the descheduling due to the dump on fault.
 */
 struct kbase_queue_group {
 	struct kbase_context *kctx;
@ -539,6 +553,15 @@ struct kbase_queue_group {

 	struct work_struct timer_event_work;

+	/**
+	 * @dvs_buf: Address and size of scratch memory.
+	 *
+	 * Used to store intermediate DVS data by the GPU.
+	 */
+	u64 dvs_buf;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	u32 deschedule_deferred_cnt;
+#endif
 };

 /**
@ -548,10 +571,10 @@ struct kbase_queue_group {
 * @lock:   Lock preventing concurrent access to @array and the @in_use bitmap.
 * @array:  Array of pointers to kernel CPU command queues.
 * @in_use: Bitmap which indicates which kernel CPU command queues are in use.
- * @wq:     Dedicated workqueue for processing kernel CPU command queues.
- * @num_cmds:           The number of commands that have been enqueued across
- *                      all the KCPU command queues. This could be used as a
- *                      timestamp to determine the command's enqueueing time.
+ * @cmd_seq_num:        The sequence number assigned to an enqueued command,
+ *                      in incrementing order (older commands shall have a
+ *                      smaller number).
+ * @jit_lock:           Lock to serialise JIT operations.
 * @jit_cmds_head:      A list of the just-in-time memory commands, both
 *                      allocate & free, in submission order, protected
 *                      by kbase_csf_kcpu_queue_context.lock.
@ -564,9 +587,9 @@ struct kbase_csf_kcpu_queue_context {
 	struct mutex lock;
 	struct kbase_kcpu_command_queue *array[KBASEP_MAX_KCPU_QUEUES];
 	DECLARE_BITMAP(in_use, KBASEP_MAX_KCPU_QUEUES);
-	struct workqueue_struct *wq;
-	u64 num_cmds;
+	atomic64_t cmd_seq_num;

+	struct mutex jit_lock;
 	struct list_head jit_cmds_head;
 	struct list_head jit_blocked_queues;
 };
@ -636,6 +659,28 @@ struct kbase_csf_tiler_heap_context {
 	u64 nr_of_heaps;
 };

+/**
+ * struct kbase_csf_ctx_heap_reclaim_info - Object representing the data section of
+ *                                          a kctx for tiler heap reclaim manager
+ * @mgr_link:            Link for hooking up to the heap reclaim manager's kctx lists
+ * @nr_freed_pages:      Number of freed pages from the kctx, after its attachment
+ *                       to the reclaim manager. This is used for tracking reclaim's
+ *                       free operation progress.
+ * @nr_est_unused_pages: Estimated number of pages that could be freed for the kctx
+ *                       when all its CSGs are off-slot, on attaching to the reclaim
+ *                       manager.
+ * @on_slot_grps:        Number of on-slot groups from this kctx. In principle, if a
+ *                       kctx has groups on-slot, the scheduler will detach it from
+ *                       the tiler heap reclaim manager, i.e. no tiler heap memory
+ *                       reclaiming operations on the kctx.
+ */
+struct kbase_csf_ctx_heap_reclaim_info {
+	struct list_head mgr_link;
+	u32 nr_freed_pages;
+	u32 nr_est_unused_pages;
+	u8 on_slot_grps;
+};
+
 /**
 * struct kbase_csf_scheduler_context - Object representing the scheduler's
 *                                      context for a GPU address space.
@ -657,6 +702,10 @@ struct kbase_csf_tiler_heap_context {
 *                      streams bound to groups of @idle_wait_groups list.
 * @ngrp_to_schedule:	Number of groups added for the context to the
 *                      'groups_to_schedule' list of scheduler instance.
+ * @heap_info:          Heap reclaim information data of the kctx. As the
+ *                      reclaim action needs to be coordinated with the scheduler
+ *                      operations, any manipulation of the data requires holding
+ *                      the scheduler's mutex lock.
 */
 struct kbase_csf_scheduler_context {
 	struct list_head runnable_groups[KBASE_QUEUE_GROUP_PRIORITY_COUNT];
@ -666,6 +715,7 @@ struct kbase_csf_scheduler_context {
 	struct workqueue_struct *sync_update_wq;
 	struct work_struct sync_update_work;
 	u32 ngrp_to_schedule;
+	struct kbase_csf_ctx_heap_reclaim_info heap_info;
 };

 /**
@ -808,6 +858,22 @@ struct kbase_csf_csg_slot {
 	u8 priority;
 };

+/**
+ * struct kbase_csf_sched_heap_reclaim_mgr - Object for managing tiler heap reclaim
+ *                                           kctx lists inside the CSF device's scheduler.
+ *
+ * @heap_reclaim:   Tiler heap reclaim shrinker object.
+ * @ctx_lists:      Array of kctx lists, size matching CSG defined priorities. The
+ *                  lists track the kctxs attached to the reclaim manager.
+ * @unused_pages:   Estimated number of unused pages from the @ctx_lists array. The
+ *                  number is indicative for use with reclaim shrinker's count method.
+ */
+struct kbase_csf_sched_heap_reclaim_mgr {
+	struct shrinker heap_reclaim;
+	struct list_head ctx_lists[KBASE_QUEUE_GROUP_PRIORITY_COUNT];
+	atomic_t unused_pages;
+};
+
 /**
 * struct kbase_csf_scheduler - Object representing the scheduler used for
 *                              CSF for an instance of GPU platform device.
@ -880,6 +946,8 @@ struct kbase_csf_csg_slot {
 *                          operation to implement timeslice-based scheduling.
 * @tock_work:              Work item that would perform the schedule on tock
 *                          operation to implement the asynchronous scheduling.
+ * @pending_tock_work:      Indicates that the tock work item should re-execute
+ *                          once it's finished instead of going back to sleep.
 * @ping_work:              Work item that would ping the firmware at regular
 *                          intervals, only if there is a single active CSG
 *                          slot, to check if firmware is alive and would
@ -889,8 +957,6 @@ struct kbase_csf_csg_slot {
 *                          @top_grp.
 * @top_grp:                Pointer to queue group inside @groups_to_schedule
 *                          list that was assigned the highest slot priority.
- * @tock_pending_request:   A "tock" request is pending: a group that is not
- *                          currently on the GPU demands to be scheduled.
 * @active_protm_grp:       Indicates if firmware has been permitted to let GPU
 *                          enter protected mode with the given group. On exit
 *                          from protected mode the pointer is reset to NULL.
@ -903,6 +969,13 @@ struct kbase_csf_csg_slot {
 *                          handler.
 * @gpu_idle_work:          Work item for facilitating the scheduler to bring
 *                          the GPU to a low-power mode on becoming idle.
+ * @fast_gpu_idle_handling: Indicates whether to relax many of the checks
+ *                          normally done in the GPU idle worker. This is
+ *                          set to true when handling the GLB IDLE IRQ if the
+ *                          idle hysteresis timeout is 0, since it makes it
+ *                          possible to receive this IRQ before the extract
+ *                          offset is published (which would cause more
+ *                          extensive GPU idle checks to fail).
 * @gpu_no_longer_idle:     Effective only when the GPU idle worker has been
 *                          queued for execution, this indicates whether the
 *                          GPU has become non-idle since the last time the
@ -934,6 +1007,7 @@ struct kbase_csf_csg_slot {
 *                          groups. It is updated on every tick/tock.
 *                          @interrupt_lock is used to serialize the access.
 * @protm_enter_time:       GPU protected mode enter time.
+ * @reclaim_mgr:            CSGs tiler heap reclaim manager object.
 */
 struct kbase_csf_scheduler {
 	struct mutex lock;
@ -960,13 +1034,14 @@ struct kbase_csf_scheduler {
 	struct hrtimer tick_timer;
 	struct work_struct tick_work;
 	struct delayed_work tock_work;
+	atomic_t pending_tock_work;
 	struct delayed_work ping_work;
 	struct kbase_context *top_ctx;
 	struct kbase_queue_group *top_grp;
-	bool tock_pending_request;
 	struct kbase_queue_group *active_protm_grp;
 	struct workqueue_struct *idle_wq;
 	struct work_struct gpu_idle_work;
+	bool fast_gpu_idle_handling;
 	atomic_t gpu_no_longer_idle;
 	atomic_t non_idle_offslot_grps;
 	u32 non_idle_scanout_grps;
@ -975,6 +1050,7 @@ struct kbase_csf_scheduler {
 	bool tick_timer_active;
 	u32 tick_protm_pending_seq;
 	ktime_t protm_enter_time;
+	struct kbase_csf_sched_heap_reclaim_mgr reclaim_mgr;
 };

 /*
@ -1161,6 +1237,7 @@ struct kbase_ipa_control {
 * @flags: bitmask of CSF_FIRMWARE_ENTRY_* conveying the interface attributes
 * @data_start: Offset into firmware image at which the interface data starts
 * @data_end: Offset into firmware image at which the interface data ends
+ * @virtual_exe_start: Starting GPU execution virtual address of this interface
 * @kernel_map: A kernel mapping of the memory or NULL if not required to be
 *              mapped in the kernel
 * @pma: Array of pointers to protected memory allocations.
@ -1177,6 +1254,7 @@ struct kbase_csf_firmware_interface {
 	u32 flags;
 	u32 data_start;
 	u32 data_end;
+	u32 virtual_exe_start;
 	void *kernel_map;
 	struct protected_memory_allocation **pma;
 };
@ -1208,6 +1286,74 @@ struct kbase_csf_mcu_fw {
 	u8 *data;
 };

+/*
+ * Firmware log polling period.
+ */
+#define KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS 25
+
+/**
+ * enum kbase_csf_firmware_log_mode - Firmware log operating mode
+ *
+ * @KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL: Manual mode, firmware log can be read
+ * manually by the userspace (and it will also be dumped automatically into
+ * dmesg on GPU reset).
+ *
+ * @KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT: Automatic printing mode, firmware log
+ * will be periodically emptied into dmesg, manual reading through debugfs is
+ * disabled.
+ */
+enum kbase_csf_firmware_log_mode {
+	KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL,
+	KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT
+};
+
+/**
+ * struct kbase_csf_firmware_log - Object containing members for handling firmware log.
+ *
+ * @mode:                      Firmware log operating mode.
+ * @busy:                      Indicating whether a firmware log operation is in progress.
+ * @poll_work:                 Work item that would poll firmware log buffer
+ *                             at regular intervals to perform any periodic
+ *                             activities required by current log mode.
+ * @dump_buf:                  Buffer used for dumping the log.
+ * @func_call_list_va_start:   Virtual address of the start of the call list of FW log functions.
+ * @func_call_list_va_end:     Virtual address of the end of the call list of FW log functions.
+ */
+struct kbase_csf_firmware_log {
+	enum kbase_csf_firmware_log_mode mode;
+	atomic_t busy;
+	struct delayed_work poll_work;
+	u8 *dump_buf;
+	u32 func_call_list_va_start;
+	u32 func_call_list_va_end;
+};
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+/**
+ * struct kbase_csf_dump_on_fault - Fault information to deliver to the daemon
+ *
+ * @error_code:       Error code.
+ * @kctx_tgid:        tgid value of the Kbase context for which the fault happened.
+ * @kctx_id:          id of the Kbase context for which the fault happened.
+ * @enabled:          Flag to indicate that 'csf_fault' debugfs has been opened
+ *                    so dump on fault is enabled.
+ * @fault_wait_wq:    Waitqueue on which user space client is blocked till kbase
+ *                    reports a fault.
+ * @dump_wait_wq:     Waitqueue on which kbase threads are blocked till user space client
+ *                    completes the dump on fault.
+ * @lock:             Lock to protect this struct members from concurrent access.
+ */
+struct kbase_csf_dump_on_fault {
+	enum dumpfault_error_type error_code;
+	u32 kctx_tgid;
+	u32 kctx_id;
+	atomic_t enabled;
+	wait_queue_head_t fault_wait_wq;
+	wait_queue_head_t dump_wait_wq;
+	spinlock_t lock;
+};
+#endif /* CONFIG_DEBUG_FS */
+
 /**
 * struct kbase_csf_device - Object representing CSF for an instance of GPU
 *                           platform device.
@ -1251,11 +1397,14 @@ struct kbase_csf_mcu_fw {
 *                          in the address space of every process, that created
 *                          a Base context, to enable the access to LATEST_FLUSH
 *                          register from userspace.
+ * @nr_user_page_mapped:    The number of clients using the mapping of USER page.
+ *                          This is used to maintain backward compatibility.
+ *                          It's protected by @reg_lock.
 * @mali_file_inode:        Pointer to the inode corresponding to mali device
 *                          file. This is needed in order to switch to the
 *                          @dummy_user_reg_page on GPU power down.
 *                          All instances of the mali device file will point to
- *                          the same inode.
+ *                          the same inode. It's protected by @reg_lock.
 * @reg_lock:               Lock to serialize the MCU firmware related actions
 *                          that affect all contexts such as allocation of
 *                          regions from shared interface area, assignment of
@ -1320,6 +1469,8 @@ struct kbase_csf_mcu_fw {
 * @hwcnt:                  Contain members required for handling the dump of
 *                          HW counters.
 * @fw:                     Copy of the loaded MCU firmware image.
+ * @fw_log:                 Contain members required for handling firmware log.
+ * @dof:                    Structure for dump on fault.
 */
 struct kbase_csf_device {
 	struct kbase_mmu_table mcu_mmu;
@ -1334,6 +1485,7 @@ struct kbase_csf_device {
 	u32 db_file_offsets;
 	struct tagged_addr dummy_db_page;
 	struct tagged_addr dummy_user_reg_page;
+	u32 nr_user_page_mapped;
 	struct inode *mali_file_inode;
 	struct mutex reg_lock;
 	wait_queue_head_t event_wait;
@ -1360,6 +1512,10 @@ struct kbase_csf_device {
 	unsigned int fw_timeout_ms;
 	struct kbase_csf_hwcnt hwcnt;
 	struct kbase_csf_mcu_fw fw;
+	struct kbase_csf_firmware_log fw_log;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	struct kbase_csf_dump_on_fault dof;
+#endif /* CONFIG_DEBUG_FS */
 };

 /**
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c
@ -169,7 +169,8 @@ void kbase_csf_event_term(struct kbase_context *kctx)
 		kfree(event_cb);
 	}

-	WARN_ON(!list_empty(&kctx->csf.event.error_list));
+	WARN(!list_empty(&kctx->csf.event.error_list),
+	     "Error list not empty for ctx %d_%d\n", kctx->tgid, kctx->id);

 	spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
 }
@ -244,6 +245,14 @@ bool kbase_csf_event_error_pending(struct kbase_context *kctx)
 	bool error_pending = false;
 	unsigned long flags;

+	/* Withhold the error event if the dump on fault is ongoing.
+	 * This would prevent the Userspace from taking error recovery actions
+	 * (which can potentially affect the state that is being dumped).
+	 * Event handling thread would eventually notice the error event.
+	 */
+	if (unlikely(!kbase_debug_csf_fault_dump_complete(kctx->kbdev)))
+		return false;
+
 	spin_lock_irqsave(&kctx->csf.event.lock, flags);
 	error_pending = !list_empty(&kctx->csf.event.error_list);

--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c
@ -21,6 +21,7 @@

 #include "mali_kbase.h"
 #include "mali_kbase_csf_firmware_cfg.h"
+#include "mali_kbase_csf_firmware_log.h"
 #include "mali_kbase_csf_trace_buffer.h"
 #include "mali_kbase_csf_timeout.h"
 #include "mali_kbase_mem.h"
@ -77,9 +78,11 @@ MODULE_PARM_DESC(fw_debug,
 	"Enables effective use of a debugger for debugging firmware code.");
 #endif

-#define FIRMWARE_HEADER_MAGIC    (0xC3F13A6Eul)
-#define FIRMWARE_HEADER_VERSION  (0ul)
-#define FIRMWARE_HEADER_LENGTH   (0x14ul)
+
+#define FIRMWARE_HEADER_MAGIC		(0xC3F13A6Eul)
+#define FIRMWARE_HEADER_VERSION_MAJOR	(0ul)
+#define FIRMWARE_HEADER_VERSION_MINOR	(2ul)
+#define FIRMWARE_HEADER_LENGTH		(0x14ul)

 #define CSF_FIRMWARE_ENTRY_SUPPORTED_FLAGS \
 	(CSF_FIRMWARE_ENTRY_READ | \
@ -92,10 +95,10 @@ MODULE_PARM_DESC(fw_debug,

 #define CSF_FIRMWARE_ENTRY_TYPE_INTERFACE     (0)
 #define CSF_FIRMWARE_ENTRY_TYPE_CONFIGURATION (1)
-#define CSF_FIRMWARE_ENTRY_TYPE_FUTF_TEST     (2)
 #define CSF_FIRMWARE_ENTRY_TYPE_TRACE_BUFFER  (3)
 #define CSF_FIRMWARE_ENTRY_TYPE_TIMELINE_METADATA (4)
 #define CSF_FIRMWARE_ENTRY_TYPE_BUILD_INFO_METADATA (6)
+#define CSF_FIRMWARE_ENTRY_TYPE_FUNC_CALL_LIST    (7)

 #define CSF_FIRMWARE_CACHE_MODE_NONE              (0ul << 3)
 #define CSF_FIRMWARE_CACHE_MODE_CACHED            (1ul << 3)
@ -431,8 +434,8 @@ static void load_fw_image_section(struct kbase_device *kbdev, const u8 *data,
 			memset(p + copy_len, 0, zi_len);
 		}

-		kbase_sync_single_for_device(kbdev, kbase_dma_addr(page),
-				PAGE_SIZE, DMA_TO_DEVICE);
+		kbase_sync_single_for_device(kbdev, kbase_dma_addr_from_tagged(phys[page_num]),
+					     PAGE_SIZE, DMA_TO_DEVICE);
 		kunmap_atomic(p);
 	}
 }
@ -525,6 +528,58 @@ static inline bool entry_find_large_page_to_reuse(
 	*pma = NULL;


+	/* If the section starts at 2MB aligned boundary,
+	 * then use 2MB page(s) for it.
+	 */
+	if (!(virtual_start & (SZ_2M - 1))) {
+		*num_pages_aligned =
+			round_up(*num_pages_aligned, NUM_4K_PAGES_IN_2MB_PAGE);
+		*is_small_page = false;
+		goto out;
+	}
+
+	/* If the section doesn't lie within the same 2MB aligned boundary,
+	 * then use 4KB pages as it would be complicated to use a 2MB page
+	 * for such section.
+	 */
+	if ((virtual_start & ~(SZ_2M - 1)) != (virtual_end & ~(SZ_2M - 1)))
+		goto out;
+
+	/* Find the nearest 2MB aligned section which comes before the current
+	 * section.
+	 */
+	list_for_each_entry(interface, &kbdev->csf.firmware_interfaces, node) {
+		const u32 virtual_diff = virtual_start - interface->virtual;
+
+		if (interface->virtual > virtual_end)
+			continue;
+
+		if (interface->virtual & (SZ_2M - 1))
+			continue;
+
+		if (virtual_diff < virtual_diff_min) {
+			target_interface = interface;
+			virtual_diff_min = virtual_diff;
+		}
+	}
+
+	if (target_interface) {
+		const u32 page_index = virtual_diff_min >> PAGE_SHIFT;
+
+		if (page_index >= target_interface->num_pages_aligned)
+			goto out;
+
+		if (target_interface->phys)
+			*phys = &target_interface->phys[page_index];
+
+		if (target_interface->pma)
+			*pma = &target_interface->pma[page_index / NUM_4K_PAGES_IN_2MB_PAGE];
+
+		*is_small_page = false;
+		reuse_large_page = true;
+	}
+
+out:
 	return reuse_large_page;
 }

@ -555,6 +610,8 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev,
 	u32 num_pages;
 	u32 num_pages_aligned;
 	char *name;
+	void *name_entry;
+	unsigned int name_len;
 	struct tagged_addr *phys = NULL;
 	struct kbase_csf_firmware_interface *interface = NULL;
 	bool allocated_pages = false, protected_mode = false;
@ -625,8 +682,8 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev,
 	} else {
 		if (!reuse_pages) {
 			ret = kbase_mem_pool_alloc_pages(
-				kbase_mem_pool_group_select(
-					kbdev, KBASE_MEM_GROUP_CSF_FW, is_small_page),
+				kbase_mem_pool_group_select(kbdev, KBASE_MEM_GROUP_CSF_FW,
+							    is_small_page),
 				num_pages_aligned, phys, false);
 		}
 	}
@ -643,21 +700,24 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev,
 			data_start, data_end);

 	/* Allocate enough memory for the struct kbase_csf_firmware_interface and
-	 * the name of the interface. An extra byte is allocated to place a
-	 * NUL-terminator in. This should already be included according to the
-	 * specification but here we add it anyway to be robust against a
-	 * corrupt firmware image.
+	 * the name of the interface.
 	 */
-	interface = kmalloc(sizeof(*interface) +
-			size - INTERFACE_ENTRY_NAME_OFFSET + 1, GFP_KERNEL);
+	name_entry = (void *)entry + INTERFACE_ENTRY_NAME_OFFSET;
+	name_len = strnlen(name_entry, size - INTERFACE_ENTRY_NAME_OFFSET);
+	if (size < (INTERFACE_ENTRY_NAME_OFFSET + name_len + 1 + sizeof(u32))) {
+		dev_err(kbdev->dev, "Memory setup entry too short to contain virtual_exe_start");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	interface = kmalloc(sizeof(*interface) + name_len + 1, GFP_KERNEL);
 	if (!interface) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	name = (void *)(interface + 1);
-	memcpy(name, entry + (INTERFACE_ENTRY_NAME_OFFSET / sizeof(*entry)),
-			size - INTERFACE_ENTRY_NAME_OFFSET);
-	name[size - INTERFACE_ENTRY_NAME_OFFSET] = 0;
+	memcpy(name, name_entry, name_len);
+	name[name_len] = 0;

 	interface->name = name;
 	interface->phys = phys;
@ -672,6 +732,11 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev,
 	interface->data_end = data_end;
 	interface->pma = pma;

+	/* Discover the virtual execution address field after the end of the name
+	 * field taking into account the NUL-termination character.
+	 */
+	interface->virtual_exe_start = *((u32 *)(name_entry + name_len + 1));
+
 	mem_flags = convert_mem_flags(kbdev, flags, &cache_mode);

 	if (flags & CSF_FIRMWARE_ENTRY_SHARED) {
@ -956,6 +1021,15 @@ static int load_firmware_entry(struct kbase_device *kbdev, const struct kbase_cs
 			return -EINVAL;
 		}
 		return parse_build_info_metadata_entry(kbdev, fw, entry, size);
+	case CSF_FIRMWARE_ENTRY_TYPE_FUNC_CALL_LIST:
+		/* Function call list section */
+		if (size < 2 * sizeof(*entry)) {
+			dev_err(kbdev->dev, "Function call list entry too short (size=%u)\n",
+				size);
+			return -EINVAL;
+		}
+		kbase_csf_firmware_log_parse_logging_call_list_entry(kbdev, entry);
+		break;
 	}

 	if (!optional) {
@ -1179,40 +1253,80 @@ static int parse_capabilities(struct kbase_device *kbdev)
 	return 0;
 }

+/*
+ * Read or write a single u32 inside the backing pages of a firmware interface.
+ *
+ * @offset_bytes is split into an index into interface->phys[] and an offset
+ * within that page. The page is mapped with kmap_atomic() for the duration of
+ * the access and unmapped before returning. When @read is true the word is
+ * copied into *value, otherwise *value is stored into the page.
+ *
+ * No range check is performed here; the callers visible in this file
+ * (access_firmware_memory() and access_firmware_memory_exe()) only pass
+ * offsets below (interface->num_pages << PAGE_SHIFT).
+ * NOTE(review): an @offset_bytes that is not 4-byte aligned and lies within
+ * the last 3 bytes of a page would make the access run past the mapped page -
+ * confirm callers always pass aligned addresses.
+ */
+static inline void access_firmware_memory_common(struct kbase_device *kbdev,
+		struct kbase_csf_firmware_interface *interface, u32 offset_bytes,
+		u32 *value, const bool read)
+{
+	u32 page_num = offset_bytes >> PAGE_SHIFT;
+	u32 offset_in_page = offset_bytes & ~PAGE_MASK;
+	struct page *target_page = as_page(interface->phys[page_num]);
+	uintptr_t cpu_addr = (uintptr_t)kmap_atomic(target_page);
+	u32 *addr = (u32 *)(cpu_addr + offset_in_page);
+
+	if (read) {
+		/* NOTE(review): the sync issued before the CPU read is the
+		 * "for_device" variant, same as on the write path - confirm
+		 * this is intended rather than a sync for the CPU.
+		 */
+		kbase_sync_single_for_device(kbdev,
+			kbase_dma_addr_from_tagged(interface->phys[page_num]) + offset_in_page,
+			sizeof(u32), DMA_BIDIRECTIONAL);
+		*value = *addr;
+	} else {
+		/* Store first, then sync the written word. */
+		*addr = *value;
+		kbase_sync_single_for_device(kbdev,
+			kbase_dma_addr_from_tagged(interface->phys[page_num]) + offset_in_page,
+			sizeof(u32), DMA_BIDIRECTIONAL);
+	}
+
+	kunmap_atomic((u32 *)cpu_addr);
+}
+
 static inline void access_firmware_memory(struct kbase_device *kbdev,
 	u32 gpu_addr, u32 *value, const bool read)
 {
-	struct kbase_csf_firmware_interface *interface;
+	struct kbase_csf_firmware_interface *interface, *access_interface = NULL;
+	u32 offset_bytes = 0;

 	list_for_each_entry(interface, &kbdev->csf.firmware_interfaces, node) {
 		if ((gpu_addr >= interface->virtual) &&
 			(gpu_addr < interface->virtual + (interface->num_pages << PAGE_SHIFT))) {
-			u32 offset_bytes = gpu_addr - interface->virtual;
-			u32 page_num = offset_bytes >> PAGE_SHIFT;
-			u32 offset_in_page = offset_bytes & ~PAGE_MASK;
-			struct page *target_page = as_page(
-				interface->phys[page_num]);
-			u32 *cpu_addr = kmap_atomic(target_page);
-
-			if (read) {
-				kbase_sync_single_for_device(kbdev,
-					kbase_dma_addr(target_page) + offset_in_page,
-					sizeof(u32), DMA_BIDIRECTIONAL);
-
-				*value = cpu_addr[offset_in_page >> 2];
-			} else {
-				cpu_addr[offset_in_page >> 2] = *value;
-
-				kbase_sync_single_for_device(kbdev,
-					kbase_dma_addr(target_page) + offset_in_page,
-					sizeof(u32), DMA_BIDIRECTIONAL);
-			}
-
-			kunmap_atomic(cpu_addr);
-			return;
+			offset_bytes = gpu_addr - interface->virtual;
+			access_interface = interface;
+			break;
 		}
 	}
-	dev_warn(kbdev->dev, "Invalid GPU VA %x passed\n", gpu_addr);
+
+	if (access_interface)
+		access_firmware_memory_common(kbdev, access_interface, offset_bytes, value, read);
+	else
+		dev_warn(kbdev->dev, "Invalid GPU VA %x passed", gpu_addr);
+}
+
+/*
+ * Read or write one u32 of firmware memory, where @gpu_addr is expressed
+ * relative to the execution address range of an interface (the range that
+ * starts at virtual_exe_start) rather than its load address.
+ *
+ * Emits a warning and does nothing else if no interface covers @gpu_addr.
+ */
+static inline void access_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 *value, const bool read)
+{
+	struct kbase_csf_firmware_interface *iter;
+	struct kbase_csf_firmware_interface *match = NULL;
+
+	list_for_each_entry(iter, &kbdev->csf.firmware_interfaces, node) {
+		/* Skip interfaces whose execution range does not hold gpu_addr */
+		if (gpu_addr < iter->virtual_exe_start)
+			continue;
+		if (gpu_addr >= iter->virtual_exe_start + (iter->num_pages << PAGE_SHIFT))
+			continue;
+
+		match = iter;
+
+		/* Execution ranges of a moved and a non-moved area may overlap.
+		 * FW may relocate sections during init, and once the layout has
+		 * settled a moved section overrides whatever non-moved area sits
+		 * at the same location, so stop at the first moved match and
+		 * otherwise keep scanning.
+		 */
+		if (iter->virtual_exe_start != iter->virtual)
+			break;
+	}
+
+	if (!match) {
+		dev_warn(kbdev->dev, "Invalid GPU VA %x passed", gpu_addr);
+		return;
+	}
+
+	access_firmware_memory_common(kbdev, match, gpu_addr - match->virtual_exe_start, value,
+				      read);
 }

 void kbase_csf_read_firmware_memory(struct kbase_device *kbdev,
@ -1227,6 +1341,18 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev,
 	access_firmware_memory(kbdev, gpu_addr, &value, false);
 }

+/* Read the u32 at @gpu_addr into *value, resolving @gpu_addr against the
+ * execution address ranges of the firmware interfaces.
+ */
+void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 *value)
+{
+	access_firmware_memory_exe(kbdev, gpu_addr, value, true);
+}
+
+/* Write @value to the u32 at @gpu_addr, resolving @gpu_addr against the
+ * execution address ranges of the firmware interfaces.
+ */
+void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 value)
+{
+	/* @value is passed by address because the common helper takes a pointer
+	 * for both directions; it is only read on this (write) path.
+	 */
+	access_firmware_memory_exe(kbdev, gpu_addr, &value, false);
+}
+
 void kbase_csf_firmware_cs_input(
 	const struct kbase_csf_cmd_stream_info *const info, const u32 offset,
 	const u32 value)
@ -1462,11 +1588,10 @@ static bool global_request_complete(struct kbase_device *const kbdev,
 	return complete;
 }

-static int wait_for_global_request(struct kbase_device *const kbdev,
-				   u32 const req_mask)
+static int wait_for_global_request_with_timeout(struct kbase_device *const kbdev,
+						u32 const req_mask, unsigned int timeout_ms)
 {
-	const long wait_timeout =
-		kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms);
+	const long wait_timeout = kbase_csf_timeout_in_jiffies(timeout_ms);
 	long remaining;
 	int err = 0;

@ -1475,10 +1600,9 @@ static int wait_for_global_request(struct kbase_device *const kbdev,
 				       wait_timeout);

 	if (!remaining) {
-		dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for global request %x to complete",
-			 kbase_backend_get_cycle_cnt(kbdev),
-			 kbdev->csf.fw_timeout_ms,
-			 req_mask);
+		dev_warn(kbdev->dev,
+			 "[%llu] Timeout (%d ms) waiting for global request %x to complete",
+			 kbase_backend_get_cycle_cnt(kbdev), timeout_ms, req_mask);
 		err = -ETIMEDOUT;

 	}
@ -1486,6 +1610,11 @@ static int wait_for_global_request(struct kbase_device *const kbdev,
 	return err;
 }

+/* Wait for the global request(s) in @req_mask using the device-wide firmware
+ * timeout (kbdev->csf.fw_timeout_ms). Returns whatever
+ * wait_for_global_request_with_timeout() returns.
+ */
+static int wait_for_global_request(struct kbase_device *const kbdev, u32 const req_mask)
+{
+	return wait_for_global_request_with_timeout(kbdev, req_mask, kbdev->csf.fw_timeout_ms);
+}
+
 static void set_global_request(
 	const struct kbase_csf_global_iface *const global_iface,
 	u32 const req_mask)
@ -1559,6 +1688,25 @@ static void enable_gpu_idle_timer(struct kbase_device *const kbdev)
 }


+/**
+ * kbasep_enable_rtu - Turn on the Ray Tracing Unit when shader cores power up
+ *
+ * @kbdev:     The kbase device structure of the device
+ *
+ * Writes 1 to SHADER_PWRFEATURES to enable the Ray Tracing Unit. The write is
+ * only done for GPUs whose ID is not below GPU_ID2_PRODUCT_MAKE(12, 8, 3, 0)
+ * and only when the host controls shader cores power.
+ */
+static void kbasep_enable_rtu(struct kbase_device *kbdev)
+{
+	const u32 id = kbdev->gpu_props.props.raw_props.gpu_id;
+	const bool id_in_range = id >= GPU_ID2_PRODUCT_MAKE(12, 8, 3, 0);
+
+	if (id_in_range && kbdev->csf.firmware_hctl_core_pwr)
+		kbase_reg_write(kbdev, GPU_CONTROL_REG(SHADER_PWRFEATURES), 1);
+}
+
 static void global_init(struct kbase_device *const kbdev, u64 core_mask)
 {
 	u32 const ack_irq_mask =
@ -1574,6 +1722,8 @@ static void global_init(struct kbase_device *const kbdev, u64 core_mask)

 	kbase_csf_scheduler_spin_lock(kbdev, &flags);

+	kbasep_enable_rtu(kbdev);
+
 	/* Update shader core allocation enable mask */
 	enable_endpoints_global(global_iface, core_mask);
 	enable_shader_poweroff_timer(kbdev, global_iface);
@ -1854,7 +2004,6 @@ u32 kbase_csf_firmware_set_gpu_idle_hysteresis_time(struct kbase_device *kbdev,

 static u32 convert_dur_to_core_pwroff_count(struct kbase_device *kbdev, const u32 dur_us)
 {
-#define PWROFF_VAL_UNIT_SHIFT (10)
 	/* Get the cntfreq_el0 value, which drives the SYSTEM_TIMESTAMP */
 	u64 freq = arch_timer_get_cntfrq();
 	u64 dur_val = dur_us;
@ -1991,16 +2140,6 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev)
 	kbdev->csf.fw_timeout_ms =
 		kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT);

-	kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS;
-#ifdef KBASE_PM_RUNTIME
-	if (kbase_pm_gpu_sleep_allowed(kbdev))
-		kbdev->csf.gpu_idle_hysteresis_ms /=
-			FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER;
-#endif
-	WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms);
-	kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count(
-		kbdev, kbdev->csf.gpu_idle_hysteresis_ms);
-
 	kbdev->csf.mcu_core_pwroff_dur_us = DEFAULT_GLB_PWROFF_TIMEOUT_US;
 	kbdev->csf.mcu_core_pwroff_dur_count = convert_dur_to_core_pwroff_count(
 		kbdev, DEFAULT_GLB_PWROFF_TIMEOUT_US);
@ -2020,7 +2159,26 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev)
 	return 0;
 }

-int kbase_csf_firmware_init(struct kbase_device *kbdev)
+/* Tear down the firmware-related state of @kbdev handled here: destroys
+ * kbdev->csf.reg_lock.
+ */
+void kbase_csf_firmware_early_term(struct kbase_device *kbdev)
+{
+	mutex_destroy(&kbdev->csf.reg_lock);
+}
+
+/* Set up the GPU idle hysteresis time and the idle duration count derived
+ * from it. Always returns 0.
+ */
+int kbase_csf_firmware_late_init(struct kbase_device *kbdev)
+{
+	/* Start from the default hysteresis time ... */
+	kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS;
+#ifdef KBASE_PM_RUNTIME
+	/* ... and scale it down when GPU sleep is allowed. */
+	if (kbase_pm_gpu_sleep_allowed(kbdev))
+		kbdev->csf.gpu_idle_hysteresis_ms /= FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER;
+#endif
+	/* A zero hysteresis time is flagged with a warning but not treated as an error */
+	WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms);
+	kbdev->csf.gpu_idle_dur_count =
+		convert_dur_to_idle_count(kbdev, kbdev->csf.gpu_idle_hysteresis_ms);
+
+	return 0;
+}
+
+int kbase_csf_firmware_load_init(struct kbase_device *kbdev)
 {
 	const struct firmware *firmware = NULL;
 	struct kbase_csf_mcu_fw *const mcu_fw = &kbdev->csf.fw;
@ -2093,7 +2251,8 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev)
 	version_minor = mcu_fw->data[4];
 	version_major = mcu_fw->data[5];

-	if (version_major != FIRMWARE_HEADER_VERSION) {
+	if (version_major != FIRMWARE_HEADER_VERSION_MAJOR ||
+			version_minor != FIRMWARE_HEADER_VERSION_MINOR) {
 		dev_err(kbdev->dev,
 				"Firmware header version %d.%d not understood\n",
 				version_major, version_minor);
@ -2188,6 +2347,12 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev)
 	if (ret != 0)
 		goto err_out;

+	ret = kbase_csf_firmware_log_init(kbdev);
+	if (ret != 0) {
+		dev_err(kbdev->dev, "Failed to initialize FW trace (err %d)", ret);
+		goto err_out;
+	}
+
 	/* Firmware loaded successfully, ret = 0 */
 	KBASE_KTRACE_ADD(kbdev, CSF_FIRMWARE_BOOT, NULL,
 			(((u64)version_hash) << 32) |
@ -2195,11 +2360,11 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev)
 	return 0;

 err_out:
-	kbase_csf_firmware_term(kbdev);
+	kbase_csf_firmware_unload_term(kbdev);
 	return ret;
 }

-void kbase_csf_firmware_term(struct kbase_device *kbdev)
+void kbase_csf_firmware_unload_term(struct kbase_device *kbdev)
 {
 	unsigned long flags;
 	int ret = 0;
@ -2210,6 +2375,8 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev)

 	WARN(ret, "failed to wait for GPU reset");

+	kbase_csf_firmware_log_term(kbdev);
+
 	kbase_csf_firmware_cfg_term(kbdev);

 	kbase_csf_timeout_term(kbdev);
@ -2297,8 +2464,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev)
 	 */
 	kbase_mcu_shared_interface_region_tracker_term(kbdev);

-	mutex_destroy(&kbdev->csf.reg_lock);
-
 	kbase_mmu_term(kbdev, &kbdev->csf.mcu_mmu);

 	/* Release the address space */
@ -2350,10 +2515,11 @@ void kbase_csf_firmware_ping(struct kbase_device *const kbdev)
 	kbase_csf_scheduler_spin_unlock(kbdev, flags);
 }

-int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev)
+int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int wait_timeout_ms)
 {
 	kbase_csf_firmware_ping(kbdev);
-	return wait_for_global_request(kbdev, GLB_REQ_PING_MASK);
+
+	return wait_for_global_request_with_timeout(kbdev, GLB_REQ_PING_MASK, wait_timeout_ms);
 }

 int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev,
@ -2392,7 +2558,7 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev)
 	kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR);
 }

-void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
+int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
 {
 	int err;

@ -2432,12 +2598,14 @@ void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
 		}
 	}

-	if (err) {
+	if (unlikely(err)) {
 		if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR))
 			kbase_reset_gpu(kbdev);
 	}

 	KBASE_TLSTREAM_AUX_PROTECTED_ENTER_END(kbdev, kbdev);
+
+	return err;
 }

 void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev)
@ -2651,9 +2819,8 @@ int kbase_csf_firmware_mcu_shared_mapping_init(
 	if (!page_list)
 		goto page_list_alloc_error;

-	ret = kbase_mem_pool_alloc_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
-		num_pages, phys, false);
+	ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
+					 phys, false);
 	if (ret <= 0)
 		goto phys_mem_pool_alloc_error;

--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h
@ -364,7 +364,45 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev,
 	u32 gpu_addr, u32 value);

 /**
- * kbase_csf_firmware_early_init() - Early initializatin for the firmware.
+ * kbase_csf_read_firmware_memory_exe - Read a value in a GPU address in the
+ *                                      region of its final execution location.
+ *
+ * @kbdev:     Device pointer
+ * @gpu_addr:  GPU address to read
+ * @value:     Output pointer to which the read value will be written
+ *
+ * This function reads a value in a GPU address that belongs to a private loaded
+ * firmware memory region based on its final execution location. The function
+ * assumes that the location is not permanently mapped on the CPU address space,
+ * therefore it maps it and then unmaps it to access it independently. This function
+ * needs to be used when accessing firmware memory regions which will be moved to
+ * their final execution location during firmware boot using an address based on the
+ * final execution location.
+ */
+void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 *value);
+
+/**
+ * kbase_csf_update_firmware_memory_exe - Write a value in a GPU address in the
+ *                                        region of its final execution location.
+ *
+ * @kbdev:     Device pointer
+ * @gpu_addr:  GPU address to write
+ * @value:     Value to write
+ *
+ * This function writes a value in a GPU address that belongs to a private loaded
+ * firmware memory region based on its final execution location. The function
+ * assumes that the location is not permanently mapped on the CPU address space,
+ * therefore it maps it and then unmaps it to access it independently. This function
+ * needs to be used when accessing firmware memory regions which will be moved to
+ * their final execution location during firmware boot using an address based on the
+ * final execution location.
+ */
+void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 value);
+
+/**
+ * kbase_csf_firmware_early_init() - Early initialization for the firmware.
 * @kbdev: Kbase device
 *
 * Initialize resources related to the firmware. Must be called at kbase probe.
@ -374,22 +412,43 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev,
 int kbase_csf_firmware_early_init(struct kbase_device *kbdev);

 /**
- * kbase_csf_firmware_init() - Load the firmware for the CSF MCU
+ * kbase_csf_firmware_early_term() - Terminate resources related to the firmware
+ *                                   after the firmware unload has been done.
+ *
+ * @kbdev: Device pointer
+ *
+ * This should be called only when kbase probe fails or gets rmmoded.
+ */
+void kbase_csf_firmware_early_term(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_late_init() - Late initialization for the firmware.
+ * @kbdev: Kbase device
+ *
+ * Initialize resources related to the firmware. Must be called after
+ * backend late init is done, and at probe time only.
+ *
+ * Return: 0 if successful, negative error code on failure
+ */
+int kbase_csf_firmware_late_init(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_load_init() - Load the firmware for the CSF MCU
 * @kbdev: Kbase device
 *
 * Request the firmware from user space and load it into memory.
 *
 * Return: 0 if successful, negative error code on failure
 */
-int kbase_csf_firmware_init(struct kbase_device *kbdev);
+int kbase_csf_firmware_load_init(struct kbase_device *kbdev);

 /**
- * kbase_csf_firmware_term() - Unload the firmware
+ * kbase_csf_firmware_unload_term() - Unload the firmware
 * @kbdev: Kbase device
 *
- * Frees the memory allocated by kbase_csf_firmware_init()
+ * Frees the memory allocated by kbase_csf_firmware_load_init()
 */
-void kbase_csf_firmware_term(struct kbase_device *kbdev);
+void kbase_csf_firmware_unload_term(struct kbase_device *kbdev);

 /**
 * kbase_csf_firmware_ping - Send the ping request to firmware.
@ -404,13 +463,14 @@ void kbase_csf_firmware_ping(struct kbase_device *kbdev);
 * kbase_csf_firmware_ping_wait - Send the ping request to firmware and waits.
 *
 * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ * @wait_timeout_ms: Timeout to get the acknowledgment for PING request from FW.
 *
 * The function sends the ping request to firmware and waits to confirm it is
 * alive.
 *
 * Return: 0 on success, or negative on failure.
 */
-int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev);
+int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev, unsigned int wait_timeout_ms);

 /**
 * kbase_csf_firmware_set_timeout - Set a hardware endpoint progress timeout.
@ -447,8 +507,10 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev);
 * This function needs to be called after kbase_csf_enter_protected_mode() to
 * wait for the GPU to actually enter protected mode. GPU reset is triggered if
 * the wait is unsuccessful.
+ *
+ * Return: 0 on success, or negative on failure.
 */
-void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev);
+int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev);

 static inline bool kbase_csf_firmware_mcu_halted(struct kbase_device *kbdev)
 {
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c
@ -20,13 +20,17 @@
 */

 #include <mali_kbase.h>
-#include "mali_kbase_csf_firmware_cfg.h"
 #include <mali_kbase_reset_gpu.h>
 #include <linux/version.h>

+#include "mali_kbase_csf_firmware_cfg.h"
+#include "mali_kbase_csf_firmware_log.h"
+
 #if CONFIG_SYSFS
 #define CSF_FIRMWARE_CFG_SYSFS_DIR_NAME "firmware_config"

+#define CSF_FIRMWARE_CFG_LOG_VERBOSITY_ENTRY_NAME "Log verbosity"
+
 /**
 * struct firmware_config - Configuration item within the MCU firmware
 *
@ -125,7 +129,7 @@ static ssize_t store_fw_cfg(struct kobject *kobj,

 	if (attr == &fw_cfg_attr_cur) {
 		unsigned long flags;
-		u32 val;
+		u32 val, cur_val;
 		int ret = kstrtouint(buf, 0, &val);

 		if (ret) {
@ -140,7 +144,9 @@ static ssize_t store_fw_cfg(struct kobject *kobj,
 			return -EINVAL;

 		spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-		if (config->cur_val == val) {
+
+		cur_val = config->cur_val;
+		if (cur_val == val) {
 			spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
 			return count;
 		}
@ -177,6 +183,20 @@ static ssize_t store_fw_cfg(struct kobject *kobj,

 		spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

+		/* Enable FW logging only if Log verbosity is non-zero */
+		if (!strcmp(config->name, CSF_FIRMWARE_CFG_LOG_VERBOSITY_ENTRY_NAME) &&
+		    (!cur_val || !val)) {
+			ret = kbase_csf_firmware_log_toggle_logging_calls(kbdev, val);
+			if (ret) {
+				/* Undo FW configuration changes */
+				spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+				config->cur_val = cur_val;
+				kbase_csf_update_firmware_memory(kbdev, config->address, cur_val);
+				spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+				return ret;
+			}
+		}
+
 		/* If we can update the config without firmware reset then
 		 * we need to just trigger FIRMWARE_CONFIG_UPDATE.
 		 */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.c
@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#include <mali_kbase.h>
+#include "backend/gpu/mali_kbase_pm_internal.h"
+#include <csf/mali_kbase_csf_firmware_log.h>
+#include <csf/mali_kbase_csf_trace_buffer.h>
+#include <linux/debugfs.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+
+/*
+ * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address.
+ */
+#define ARMV7_T1_BL_IMM_INSTR		0xd800f000
+
+/*
+ * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address, maximum
+ * negative jump offset.
+ */
+#define ARMV7_T1_BL_IMM_RANGE_MIN	-16777216
+
+/*
+ * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address, maximum
+ * positive jump offset.
+ */
+#define ARMV7_T1_BL_IMM_RANGE_MAX	16777214
+
+/*
+ * ARMv7 instruction: Double NOP instructions.
+ */
+#define ARMV7_DOUBLE_NOP_INSTR		0xbf00bf00
+
+#if defined(CONFIG_DEBUG_FS)
+
+/**
+ * kbase_csf_firmware_log_enable_mask_read - Getter of the trace enable mask attribute.
+ *
+ * @data: Opaque attribute data, used as the kbase device pointer.
+ * @val:  Output location for the active mask of the firmware log trace buffer.
+ *
+ * Return: 0 on success, -EIO if the firmware log trace buffer cannot be found.
+ */
+static int kbase_csf_firmware_log_enable_mask_read(void *data, u64 *val)
+{
+	struct kbase_device *const kbdev = data;
+	struct firmware_trace_buffer *log_tb;
+
+	log_tb = kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME);
+	if (!log_tb) {
+		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
+		return -EIO;
+	}
+
+	/* Only a 64-bit view of the enabled traces is reported, regarded practical */
+	*val = kbase_csf_firmware_trace_buffer_get_active_mask64(log_tb);
+
+	return 0;
+}
+
+/**
+ * kbase_csf_firmware_log_enable_mask_write - Setter of the trace enable mask attribute.
+ *
+ * @data: Opaque attribute data, used as the kbase device pointer.
+ * @val:  Requested enable mask. Bits beyond the number of trace enable bits
+ *        reported by the trace buffer (capped at 64) are ignored.
+ *
+ * Return: 0 on success or if the mask is unchanged, -EIO if the firmware log
+ *         trace buffer cannot be found, otherwise the result of updating the
+ *         active mask.
+ */
+static int kbase_csf_firmware_log_enable_mask_write(void *data, u64 val)
+{
+	struct kbase_device *kbdev = (struct kbase_device *)data;
+	struct firmware_trace_buffer *tb =
+		kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME);
+	u64 new_mask;
+	unsigned int enable_bits_count;
+
+	if (tb == NULL) {
+		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
+		return -EIO;
+	}
+
+	/* Ignore unsupported types */
+	enable_bits_count = kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count(tb);
+	if (enable_bits_count > 64) {
+		dev_dbg(kbdev->dev, "Limit enabled bits count from %u to 64", enable_bits_count);
+		enable_bits_count = 64;
+	}
+
+	/* Build the mask in 64 bits: shifting a 32-bit int by 31 or more positions
+	 * is undefined, and so is a 64-bit shift by 64, hence the right shift of an
+	 * all-ones value and the explicit handling of a zero bit count.
+	 */
+	if (enable_bits_count)
+		new_mask = val & (~0ULL >> (64 - enable_bits_count));
+	else
+		new_mask = 0;
+
+	if (new_mask != kbase_csf_firmware_trace_buffer_get_active_mask64(tb))
+		return kbase_csf_firmware_trace_buffer_set_active_mask64(tb, new_mask);
+	else
+		return 0;
+}
+
+/**
+ * kbasep_csf_firmware_log_debugfs_open - Open handler of the firmware log dump file.
+ *
+ * @in:   Inode of the debugfs file; its i_private is used as the kbase device.
+ * @file: File being opened; receives the kbase device as private data.
+ *
+ * Return: Always 0.
+ */
+static int kbasep_csf_firmware_log_debugfs_open(struct inode *in, struct file *file)
+{
+	struct kbase_device *const owner = in->i_private;
+
+	/* Stash the device so that the read handler can retrieve it */
+	file->private_data = owner;
+
+	dev_dbg(owner->dev, "Opened firmware trace buffer dump debugfs file");
+	return 0;
+}
+
+/**
+ * kbasep_csf_firmware_log_debugfs_read - Read handler of the firmware log dump file.
+ *
+ * @file: File instance; private_data holds the kbase device.
+ * @buf:  User buffer that receives the trace data.
+ * @size: Maximum number of bytes requested by the caller.
+ * @ppos: File position, advanced by the number of bytes returned.
+ *
+ * Return: Number of bytes copied to @buf (may be 0), -EIO if the trace buffer
+ *         cannot be found, -EBUSY if the busy flag is already taken, -EINVAL
+ *         if the log is not in manual mode, or -EFAULT if the copy to user
+ *         space fails.
+ */
+static ssize_t kbasep_csf_firmware_log_debugfs_read(struct file *file, char __user *buf,
+						    size_t size, loff_t *ppos)
+{
+	struct kbase_device *kbdev = file->private_data;
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+	unsigned int n_read;
+	unsigned long not_copied;
+	/* Limit reads to the kernel dump buffer size */
+	size_t mem = MIN(size, FIRMWARE_LOG_DUMP_BUF_SIZE);
+	int ret;
+
+	struct firmware_trace_buffer *tb =
+		kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME);
+
+	if (tb == NULL) {
+		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
+		return -EIO;
+	}
+
+	/* Claim the busy flag; it guards the shared dump buffer and the mode */
+	if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0)
+		return -EBUSY;
+
+	/* Reading from userspace is only allowed in manual mode */
+	if (fw_log->mode != KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Stage the data in the kernel dump buffer before copying it out */
+	n_read = kbase_csf_firmware_trace_buffer_read_data(tb, fw_log->dump_buf, mem);
+
+	/* Do the copy, if we have obtained some trace data */
+	not_copied = (n_read) ? copy_to_user(buf, fw_log->dump_buf, n_read) : 0;
+
+	if (not_copied) {
+		dev_err(kbdev->dev, "Couldn't copy trace buffer data to user space buffer");
+		ret = -EFAULT;
+		goto out;
+	}
+
+	*ppos += n_read;
+	ret = n_read;
+
+out:
+	/* Release the busy flag on every path taken after it was claimed */
+	atomic_set(&fw_log->busy, 0);
+	return ret;
+}
+
+/**
+ * kbase_csf_firmware_log_mode_read - Getter of the firmware log mode attribute.
+ *
+ * @data: Opaque attribute data, used as the kbase device pointer.
+ * @val:  Output location for the current firmware log mode.
+ *
+ * Return: Always 0.
+ */
+static int kbase_csf_firmware_log_mode_read(void *data, u64 *val)
+{
+	struct kbase_device *const kbdev = data;
+
+	*val = kbdev->csf.fw_log.mode;
+
+	return 0;
+}
+
+/**
+ * kbase_csf_firmware_log_mode_write - Setter of the firmware log mode attribute.
+ *
+ * @data: Opaque attribute data, used as the kbase device pointer.
+ * @val:  Requested mode: KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL or
+ *        KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT.
+ *
+ * Switching to manual mode cancels the polling work; switching to auto-print
+ * mode schedules it. Writing the current mode is a no-op.
+ *
+ * Return: 0 on success, -EBUSY if the busy flag is already taken, -EINVAL if
+ *         @val is not a known mode.
+ */
+static int kbase_csf_firmware_log_mode_write(void *data, u64 val)
+{
+	struct kbase_device *kbdev = (struct kbase_device *)data;
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+	int ret = 0;
+
+	/* Claim the busy flag so the mode cannot change under a reader or dumper */
+	if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0)
+		return -EBUSY;
+
+	/* Nothing to do when the requested mode is already active */
+	if (val == fw_log->mode)
+		goto out;
+
+	switch (val) {
+	case KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL:
+		/* Stop the periodic dump and wait for a running instance to finish */
+		cancel_delayed_work_sync(&fw_log->poll_work);
+		break;
+	case KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT:
+		/* Start the periodic dump; the work re-queues itself afterwards */
+		schedule_delayed_work(&fw_log->poll_work,
+				      msecs_to_jiffies(KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS));
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fw_log->mode = val;
+
+out:
+	atomic_set(&fw_log->busy, 0);
+	return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_log_enable_mask_fops,
+			 kbase_csf_firmware_log_enable_mask_read,
+			 kbase_csf_firmware_log_enable_mask_write, "%llx\n");
+
+static const struct file_operations kbasep_csf_firmware_log_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = kbasep_csf_firmware_log_debugfs_open,
+	.read = kbasep_csf_firmware_log_debugfs_read,
+	.llseek = no_llseek,
+};
+
+DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_log_mode_fops, kbase_csf_firmware_log_mode_read,
+			 kbase_csf_firmware_log_mode_write, "%llu\n");
+
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * kbase_csf_firmware_log_poll - Delayed work that dumps the firmware log.
+ *
+ * @work: Work item embedded in the poll_work member of the device's fw_log.
+ *
+ * Re-queues itself with a delay of KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS and
+ * then dumps the firmware log buffer.
+ */
+static void kbase_csf_firmware_log_poll(struct work_struct *work)
+{
+	struct kbase_device *kbdev =
+		container_of(work, struct kbase_device, csf.fw_log.poll_work.work);
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+
+	/* Re-arm first so the next poll is scheduled before this dump runs */
+	schedule_delayed_work(&fw_log->poll_work,
+			      msecs_to_jiffies(KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS));
+
+	kbase_csf_firmware_log_dump_buffer(kbdev);
+}
+
+/*
+ * Allocates the dump buffer, sets the default (manual) mode, prepares the
+ * polling work and, when debugfs is enabled, creates the fw_trace_enable_mask,
+ * fw_traces and fw_trace_mode files. Returns 0 on success or -ENOMEM if the
+ * dump buffer cannot be allocated.
+ */
+int kbase_csf_firmware_log_init(struct kbase_device *kbdev)
+{
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+
+	/* Add one byte for null-termination */
+	fw_log->dump_buf = kmalloc(FIRMWARE_LOG_DUMP_BUF_SIZE + 1, GFP_KERNEL);
+	if (fw_log->dump_buf == NULL)
+		return -ENOMEM;
+
+	/* Ensure null-termination for all strings */
+	fw_log->dump_buf[FIRMWARE_LOG_DUMP_BUF_SIZE] = 0;
+
+	fw_log->mode = KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL;
+
+	/* The busy flag starts released; the poll work is only initialised here,
+	 * not scheduled
+	 */
+	atomic_set(&fw_log->busy, 0);
+	INIT_DEFERRABLE_WORK(&fw_log->poll_work, kbase_csf_firmware_log_poll);
+
+#if defined(CONFIG_DEBUG_FS)
+	/* The results of debugfs file creation are not checked */
+	debugfs_create_file("fw_trace_enable_mask", 0644, kbdev->mali_debugfs_directory, kbdev,
+			    &kbase_csf_firmware_log_enable_mask_fops);
+	debugfs_create_file("fw_traces", 0444, kbdev->mali_debugfs_directory, kbdev,
+			    &kbasep_csf_firmware_log_debugfs_fops);
+	debugfs_create_file("fw_trace_mode", 0644, kbdev->mali_debugfs_directory, kbdev,
+			    &kbase_csf_firmware_log_mode_fops);
+#endif /* CONFIG_DEBUG_FS */
+
+	return 0;
+}
+
+/*
+ * Stops the polling work and releases the dump buffer. Does nothing when the
+ * dump buffer pointer is NULL.
+ */
+void kbase_csf_firmware_log_term(struct kbase_device *kbdev)
+{
+	struct kbase_csf_firmware_log *const log_state = &kbdev->csf.fw_log;
+
+	if (!log_state->dump_buf)
+		return;
+
+	/* Make sure the poll work is no longer running before freeing its buffer */
+	cancel_delayed_work_sync(&log_state->poll_work);
+
+	kfree(log_state->dump_buf);
+	log_state->dump_buf = NULL;
+}
+
+/*
+ * Drains the firmware log trace buffer through the kernel dump buffer and
+ * prints it line by line with a "FW> " prefix. Returns silently if the trace
+ * buffer cannot be found or if the busy flag is already taken.
+ */
+void kbase_csf_firmware_log_dump_buffer(struct kbase_device *kbdev)
+{
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+	u8 *buf = fw_log->dump_buf, *p, *pnewline, *pend, *pendbuf;
+	unsigned int read_size, remaining_size;
+	struct firmware_trace_buffer *tb =
+		kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME);
+
+	if (tb == NULL) {
+		dev_dbg(kbdev->dev, "Can't get the trace buffer, firmware trace dump skipped");
+		return;
+	}
+
+	/* Skip this dump if the dump buffer is in use by someone else */
+	if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0)
+		return;
+
+	/* FW should only print complete messages, so there's no need to handle
+	 * partial messages over multiple invocations of this function
+	 */
+
+	/* p is the write position for the next read; pendbuf is the end of the
+	 * usable part of the dump buffer
+	 */
+	p = buf;
+	pendbuf = &buf[FIRMWARE_LOG_DUMP_BUF_SIZE];
+
+	while ((read_size = kbase_csf_firmware_trace_buffer_read_data(tb, p, pendbuf - p))) {
+		/* pend marks the end of valid data; scanning restarts at buf because
+		 * any unfinished string from the previous pass sits at the start
+		 */
+		pend = p + read_size;
+		p = buf;
+
+		while (p < pend && (pnewline = memchr(p, '\n', pend - p))) {
+			/* Null-terminate the string */
+			*pnewline = 0;
+
+			dev_err(kbdev->dev, "FW> %s", p);
+
+			p = pnewline + 1;
+		}
+
+		remaining_size = pend - p;
+
+		if (!remaining_size) {
+			/* Everything was printed; reuse the whole buffer */
+			p = buf;
+		} else if (remaining_size < FIRMWARE_LOG_DUMP_BUF_SIZE) {
+			/* Copy unfinished string to the start of the buffer */
+			memmove(buf, p, remaining_size);
+			p = &buf[remaining_size];
+		} else {
+			/* Print abnormally long string without newlines */
+			/* The buffer is full here; the terminator is the extra byte at
+			 * buf[FIRMWARE_LOG_DUMP_BUF_SIZE], which this function never
+			 * overwrites
+			 */
+			dev_err(kbdev->dev, "FW> %s", buf);
+			p = buf;
+		}
+	}
+
+	if (p != buf) {
+		/* Null-terminate and print last unfinished string */
+		*p = 0;
+		dev_err(kbdev->dev, "FW> %s", buf);
+	}
+
+	atomic_set(&fw_log->busy, 0);
+}
+
+/*
+ * Records the bounds of the FW logging function call list: the first word of
+ * the entry is stored as the list start and the second word as the list end.
+ */
+void kbase_csf_firmware_log_parse_logging_call_list_entry(struct kbase_device *kbdev,
+							  const uint32_t *entry)
+{
+	struct kbase_csf_firmware_log *const fw_log = &kbdev->csf.fw_log;
+
+	fw_log->func_call_list_va_start = entry[0];
+	fw_log->func_call_list_va_end = entry[1];
+}
+
+/**
+ * toggle_logging_calls_in_loaded_image - Toggles FW log func calls in loaded FW image.
+ *
+ * @kbdev:  Instance of a GPU platform device that implements a CSF interface.
+ * @enable: Whether to enable or disable the function calls.
+ *
+ * Walks the list delimited by func_call_list_va_start and func_call_list_va_end
+ * in firmware memory. Each list entry is a pair of 32-bit words: the address of
+ * the call site followed by the address of the callee. When enabling, every
+ * call site is patched with a BL instruction targeting its callee; when
+ * disabling, every call site is overwritten with two NOP instructions. Call
+ * sites whose callee is outside the BL immediate range are skipped with a
+ * warning. Nothing is done if either list bound is zero.
+ */
+static void toggle_logging_calls_in_loaded_image(struct kbase_device *kbdev, bool enable)
+{
+	uint32_t bl_instruction, diff;
+	uint32_t imm11, imm10, i1, i2, j1, j2, sign;
+	uint32_t calling_address = 0, callee_address = 0;
+	uint32_t list_entry = kbdev->csf.fw_log.func_call_list_va_start;
+	const uint32_t list_va_end = kbdev->csf.fw_log.func_call_list_va_end;
+
+	if (list_entry == 0 || list_va_end == 0)
+		return;
+
+	if (enable) {
+		for (; list_entry < list_va_end; list_entry += 2 * sizeof(uint32_t)) {
+			/* Read calling address */
+			kbase_csf_read_firmware_memory(kbdev, list_entry, &calling_address);
+			/* Read callee address */
+			kbase_csf_read_firmware_memory(kbdev, list_entry + sizeof(uint32_t),
+					&callee_address);
+
+			/* Branch offset, taken relative to the call site address plus 4 */
+			diff = callee_address - calling_address - 4;
+
+			/* Skip the call site if the offset is below the minimum OR above
+			 * the maximum; testing both at once could never be true and let
+			 * unencodable offsets through
+			 */
+			if ((int32_t)diff < ARMV7_T1_BL_IMM_RANGE_MIN ||
+					(int32_t)diff > ARMV7_T1_BL_IMM_RANGE_MAX) {
+				dev_warn(kbdev->dev, "FW log patch 0x%x out of range, skipping",
+						calling_address);
+				continue;
+			}
+
+			/* Split the offset into the fields of the BL encoding */
+			sign = !!(diff & 0x80000000);
+			i1 = (diff & 0x00800000) >> 23;
+			j1 = !i1 ^ sign;
+			i2 = (diff & 0x00400000) >> 22;
+			j2 = !i2 ^ sign;
+			imm11 = (diff & 0xffe) >> 1;
+			imm10 = (diff & 0x3ff000) >> 12;
+
+			/* Compose BL instruction */
+			bl_instruction = ARMV7_T1_BL_IMM_INSTR;
+			bl_instruction |= j1 << 29;
+			bl_instruction |= j2 << 27;
+			bl_instruction |= imm11 << 16;
+			bl_instruction |= sign << 10;
+			bl_instruction |= imm10;
+
+			/* Patch logging func calls in their load location */
+			dev_dbg(kbdev->dev, "FW log patch 0x%x: 0x%x\n", calling_address,
+					bl_instruction);
+			kbase_csf_update_firmware_memory_exe(kbdev, calling_address,
+					bl_instruction);
+		}
+	} else {
+		for (; list_entry < list_va_end; list_entry += 2 * sizeof(uint32_t)) {
+			/* Read calling address */
+			kbase_csf_read_firmware_memory(kbdev, list_entry, &calling_address);
+
+			/* Overwrite logging func calls with 2 NOP instructions */
+			kbase_csf_update_firmware_memory_exe(kbdev, calling_address,
+					ARMV7_DOUBLE_NOP_INSTR);
+		}
+	}
+}
+
+/*
+ * Enables (val != 0) or disables (val == 0) the FW logging calls in the loaded
+ * firmware image. The scheduler is suspended and the MCU is required to be
+ * inactive before the image is patched; the scheduler is resumed afterwards if
+ * this function suspended it.
+ *
+ * Returns 0 on success, -EBUSY if the firmware log busy flag is already taken,
+ * or -EAGAIN if the PM state wait fails or the MCU is still active.
+ */
+int kbase_csf_firmware_log_toggle_logging_calls(struct kbase_device *kbdev, u32 val)
+{
+	unsigned long flags;
+	struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log;
+	bool mcu_inactive;
+	bool resume_needed = false;
+	int ret = 0;
+	struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+
+	if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0)
+		return -EBUSY;
+
+	/* Suspend all the active CS groups */
+	dev_dbg(kbdev->dev, "Suspend all the active CS groups");
+
+	/* The scheduler lock is dropped around the suspend call and the state is
+	 * re-checked under the lock; the loop exits with the lock held
+	 */
+	kbase_csf_scheduler_lock(kbdev);
+	while (scheduler->state != SCHED_SUSPENDED) {
+		kbase_csf_scheduler_unlock(kbdev);
+		kbase_csf_scheduler_pm_suspend(kbdev);
+		kbase_csf_scheduler_lock(kbdev);
+		resume_needed = true;
+	}
+
+	/* Wait for the MCU to get disabled */
+	dev_info(kbdev->dev, "Wait for the MCU to get disabled");
+	ret = kbase_pm_wait_for_desired_state(kbdev);
+	if (ret) {
+		dev_err(kbdev->dev,
+			"wait for PM state failed when toggling FW logging calls");
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* The MCU state is sampled under hwaccess_lock */
+	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+	mcu_inactive =
+		kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state);
+	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+	if (!mcu_inactive) {
+		dev_err(kbdev->dev,
+			"MCU not inactive after PM state wait when toggling FW logging calls");
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* Toggle FW logging call in the loaded FW image */
+	toggle_logging_calls_in_loaded_image(kbdev, val);
+	dev_dbg(kbdev->dev, "FW logging: %s", val ? "enabled" : "disabled");
+
+out:
+	/* Common exit: release the scheduler lock, undo the suspend if this
+	 * function performed it, then release the busy flag
+	 */
+	kbase_csf_scheduler_unlock(kbdev);
+	if (resume_needed)
+		/* Resume queue groups and start mcu */
+		kbase_csf_scheduler_pm_resume(kbdev);
+	atomic_set(&fw_log->busy, 0);
+	return ret;
+}
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.h
@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _KBASE_CSF_FIRMWARE_LOG_H_
+#define _KBASE_CSF_FIRMWARE_LOG_H_
+
+#include <mali_kbase.h>
+
+/*
+ * Firmware log dumping buffer size.
+ */
+#define FIRMWARE_LOG_DUMP_BUF_SIZE PAGE_SIZE
+
+/**
+ * kbase_csf_firmware_log_init - Initialize firmware log handling.
+ *
+ * @kbdev: Pointer to the Kbase device
+ *
+ * Return: The initialization error code.
+ */
+int kbase_csf_firmware_log_init(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_log_term - Terminate firmware log handling.
+ *
+ * @kbdev: Pointer to the Kbase device
+ */
+void kbase_csf_firmware_log_term(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_log_dump_buffer - Read remaining data in the firmware log
+ *                                  buffer and print it to dmesg.
+ *
+ * @kbdev: Pointer to the Kbase device
+ */
+void kbase_csf_firmware_log_dump_buffer(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_log_parse_logging_call_list_entry - Parse FW logging function call list entry.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
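+ * @entry: Pointer to the entry words: entry[0] is stored as the start and entry[1]
+ *         as the end virtual address of the logging function call list.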
+ */
+void kbase_csf_firmware_log_parse_logging_call_list_entry(struct kbase_device *kbdev,
+							  const uint32_t *entry);
+/**
+ * kbase_csf_firmware_log_toggle_logging_calls - Enables/Disables FW logging function calls.
+ *
+ * @kbdev:  Instance of a GPU platform device that implements a CSF interface.
+ * @val:    Configuration option value.
+ *
+ * Return: 0 if successful, negative error code on failure
+ */
+int kbase_csf_firmware_log_toggle_logging_calls(struct kbase_device *kbdev, u32 val);
+
+#endif /* _KBASE_CSF_FIRMWARE_LOG_H_ */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c
@ -273,6 +273,18 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev,
 	/* NO_MALI: Nothing to do here */
 }

+/* NO_MALI stub: performs no read and leaves *value untouched */
+void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 *value)
+{
+	/* NO_MALI: Nothing to do here */
+}
+
+/* NO_MALI stub: the requested write is discarded */
+void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev,
+	u32 gpu_addr, u32 value)
+{
+	/* NO_MALI: Nothing to do here */
+}
+
 void kbase_csf_firmware_cs_input(
 	const struct kbase_csf_cmd_stream_info *const info, const u32 offset,
 	const u32 value)
@ -971,7 +983,6 @@ u32 kbase_csf_firmware_set_gpu_idle_hysteresis_time(struct kbase_device *kbdev,

 static u32 convert_dur_to_core_pwroff_count(struct kbase_device *kbdev, const u32 dur_us)
 {
-#define PWROFF_VAL_UNIT_SHIFT (10)
 	/* Get the cntfreq_el0 value, which drives the SYSTEM_TIMESTAMP */
 	u64 freq = arch_timer_get_cntfrq();
 	u64 dur_val = dur_us;
@ -1046,16 +1057,6 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev)
 	kbdev->csf.fw_timeout_ms =
 		kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT);

-	kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS;
-#ifdef KBASE_PM_RUNTIME
-	if (kbase_pm_gpu_sleep_allowed(kbdev))
-		kbdev->csf.gpu_idle_hysteresis_ms /=
-			FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER;
-#endif
-	WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms);
-	kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count(
-		kbdev, kbdev->csf.gpu_idle_hysteresis_ms);
-
 	INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces);
 	INIT_LIST_HEAD(&kbdev->csf.firmware_config);
 	INIT_LIST_HEAD(&kbdev->csf.firmware_trace_buffers.list);
@ -1068,7 +1069,26 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev)
 	return 0;
 }

-int kbase_csf_firmware_init(struct kbase_device *kbdev)
+/* Early termination: only destroys the kbdev->csf.reg_lock mutex */
+void kbase_csf_firmware_early_term(struct kbase_device *kbdev)
+{
+	mutex_destroy(&kbdev->csf.reg_lock);
+}
+
+/*
+ * Late initialisation: sets the GPU idle hysteresis time to its default
+ * (scaled down when KBASE_PM_RUNTIME is defined and GPU sleep is allowed) and
+ * derives the matching idle duration count. Always returns 0.
+ */
+int kbase_csf_firmware_late_init(struct kbase_device *kbdev)
+{
+	kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS;
+#ifdef KBASE_PM_RUNTIME
+	if (kbase_pm_gpu_sleep_allowed(kbdev))
+		kbdev->csf.gpu_idle_hysteresis_ms /= FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER;
+#endif
+	/* Warn if the hysteresis time ended up as zero, e.g. after the division */
+	WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms);
+	kbdev->csf.gpu_idle_dur_count =
+		convert_dur_to_idle_count(kbdev, kbdev->csf.gpu_idle_hysteresis_ms);
+
+	return 0;
+}
+
+int kbase_csf_firmware_load_init(struct kbase_device *kbdev)
 {
 	int ret;

@ -1134,11 +1154,11 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev)
 	return 0;

 error:
-	kbase_csf_firmware_term(kbdev);
+	kbase_csf_firmware_unload_term(kbdev);
 	return ret;
 }

-void kbase_csf_firmware_term(struct kbase_device *kbdev)
+void kbase_csf_firmware_unload_term(struct kbase_device *kbdev)
 {
 	cancel_work_sync(&kbdev->csf.fw_error_work);

@ -1173,8 +1193,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev)

 	/* NO_MALI: No trace buffers to terminate */

-	mutex_destroy(&kbdev->csf.reg_lock);
-
 	/* This will also free up the region allocated for the shared interface
 	 * entry parsed from the firmware image.
 	 */
@ -1227,8 +1245,9 @@ void kbase_csf_firmware_ping(struct kbase_device *const kbdev)
 	kbase_csf_scheduler_spin_unlock(kbdev, flags);
 }

-int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev)
+int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int wait_timeout_ms)
 {
+	CSTD_UNUSED(wait_timeout_ms);
 	kbase_csf_firmware_ping(kbdev);
 	return wait_for_global_request(kbdev, GLB_REQ_PING_MASK);
 }
@ -1267,7 +1286,7 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev)
 	kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR);
 }

-void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
+int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
 {
 	int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK);

@ -1275,6 +1294,8 @@ void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev)
 		if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
 			kbase_reset_gpu(kbdev);
 	}
+
+	return err;
 }

 void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev)
@ -1483,9 +1504,8 @@ int kbase_csf_firmware_mcu_shared_mapping_init(
 	if (!page_list)
 		goto page_list_alloc_error;

-	ret = kbase_mem_pool_alloc_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
-		num_pages, phys, false);
+	ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
+					 phys, false);
 	if (ret <= 0)
 		goto phys_mem_pool_alloc_error;

--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -154,8 +154,8 @@ u64 kbase_csf_heap_context_allocator_alloc(
 	struct kbase_csf_heap_context_allocator *const ctx_alloc)
 {
 	struct kbase_context *const kctx = ctx_alloc->kctx;
-	u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
-		BASE_MEM_PROT_CPU_WR | BASEP_MEM_NO_USER_FREE;
+	u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_CPU_WR |
+		    BASEP_MEM_NO_USER_FREE | BASE_MEM_PROT_CPU_RD;
 	u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE);
 	u64 heap_gpu_va = 0;

@ -164,10 +164,6 @@ u64 kbase_csf_heap_context_allocator_alloc(
 	 */
 	const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;

-#ifdef CONFIG_MALI_VECTOR_DUMP
-	flags |= BASE_MEM_PROT_CPU_RD;
-#endif
-
 	mutex_lock(&ctx_alloc->lock);

 	/* If the pool of heap contexts wasn't already allocated then
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c
@ -55,7 +55,7 @@ static int kbase_kcpu_map_import_prepare(
 	long i;
 	int ret = 0;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	/* Take the processes mmap lock */
 	down_read(kbase_mem_get_process_mmap_lock());
@ -114,7 +114,7 @@ static int kbase_kcpu_unmap_import_prepare_internal(
 	struct kbase_va_region *reg;
 	int ret = 0;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	kbase_gpu_vm_lock(kctx);

@ -182,7 +182,8 @@ static void kbase_jit_add_to_pending_alloc_list(
 			&kctx->csf.kcpu_queues.jit_blocked_queues;
 	struct kbase_kcpu_command_queue *blocked_queue;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);
+	lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock);

 	list_for_each_entry(blocked_queue,
 			&kctx->csf.kcpu_queues.jit_blocked_queues,
@ -227,25 +228,28 @@ static int kbase_kcpu_jit_allocate_process(
 	u32 i;
 	int ret;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
-
-	if (alloc_info->blocked) {
-		list_del(&queue->jit_blocked);
-		alloc_info->blocked = false;
-	}
+	lockdep_assert_held(&queue->lock);

 	if (WARN_ON(!info))
 		return -EINVAL;

+	mutex_lock(&kctx->csf.kcpu_queues.jit_lock);
+
 	/* Check if all JIT IDs are not in use */
 	for (i = 0; i < count; i++, info++) {
 		/* The JIT ID is still in use so fail the allocation */
 		if (kctx->jit_alloc[info->id]) {
 			dev_dbg(kctx->kbdev->dev, "JIT ID still in use");
-			return -EINVAL;
+			ret = -EINVAL;
+			goto fail;
 		}
 	}

+	if (alloc_info->blocked) {
+		list_del(&queue->jit_blocked);
+		alloc_info->blocked = false;
+	}
+
 	/* Now start the allocation loop */
 	for (i = 0, info = alloc_info->info; i < count; i++, info++) {
 		/* Create a JIT allocation */
@ -280,7 +284,7 @@ static int kbase_kcpu_jit_allocate_process(
 				 */
 				dev_warn_ratelimited(kctx->kbdev->dev, "JIT alloc command failed: %pK\n", cmd);
 				ret = -ENOMEM;
-				goto fail;
+				goto fail_rollback;
 			}

 			/* There are pending frees for an active allocation
@ -298,7 +302,8 @@ static int kbase_kcpu_jit_allocate_process(
 				kctx->jit_alloc[info->id] = NULL;
 			}

-			return -EAGAIN;
+			ret = -EAGAIN;
+			goto fail;
 		}

 		/* Bind it to the user provided ID. */
@ -314,7 +319,7 @@ static int kbase_kcpu_jit_allocate_process(
 				KBASE_REG_CPU_WR, &mapping);
 		if (!ptr) {
 			ret = -ENOMEM;
-			goto fail;
+			goto fail_rollback;
 		}

 		reg = kctx->jit_alloc[info->id];
@ -323,9 +328,11 @@ static int kbase_kcpu_jit_allocate_process(
 		kbase_vunmap(kctx, &mapping);
 	}

+	mutex_unlock(&kctx->csf.kcpu_queues.jit_lock);
+
 	return 0;

-fail:
+fail_rollback:
 	/* Roll back completely */
 	for (i = 0, info = alloc_info->info; i < count; i++, info++) {
 		/* Free the allocations that were successful.
@ -338,6 +345,8 @@ static int kbase_kcpu_jit_allocate_process(

 		kctx->jit_alloc[info->id] = KBASE_RESERVED_REG_JIT_ALLOC;
 	}
+fail:
+	mutex_unlock(&kctx->csf.kcpu_queues.jit_lock);

 	return ret;
 }
@ -354,7 +363,7 @@ static int kbase_kcpu_jit_allocate_prepare(
 	int ret = 0;
 	u32 i;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (!data || count > kcpu_queue->kctx->jit_max_allocations ||
 			count > ARRAY_SIZE(kctx->jit_alloc)) {
@ -392,11 +401,13 @@ static int kbase_kcpu_jit_allocate_prepare(
 	}

 	current_command->type = BASE_KCPU_COMMAND_TYPE_JIT_ALLOC;
-	list_add_tail(&current_command->info.jit_alloc.node,
-			&kctx->csf.kcpu_queues.jit_cmds_head);
 	current_command->info.jit_alloc.info = info;
 	current_command->info.jit_alloc.count = count;
 	current_command->info.jit_alloc.blocked = false;
+	mutex_lock(&kctx->csf.kcpu_queues.jit_lock);
+	list_add_tail(&current_command->info.jit_alloc.node,
+			&kctx->csf.kcpu_queues.jit_cmds_head);
+	mutex_unlock(&kctx->csf.kcpu_queues.jit_lock);

 	return 0;
 out_free:
@ -415,7 +426,9 @@ static void kbase_kcpu_jit_allocate_finish(
 		struct kbase_kcpu_command_queue *queue,
 		struct kbase_kcpu_command *cmd)
 {
-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);
+
+	mutex_lock(&queue->kctx->csf.kcpu_queues.jit_lock);

 	/* Remove this command from the jit_cmds_head list */
 	list_del(&cmd->info.jit_alloc.node);
@ -429,6 +442,8 @@ static void kbase_kcpu_jit_allocate_finish(
 		cmd->info.jit_alloc.blocked = false;
 	}

+	mutex_unlock(&queue->kctx->csf.kcpu_queues.jit_lock);
+
 	kfree(cmd->info.jit_alloc.info);
 }

@ -441,18 +456,17 @@ static void kbase_kcpu_jit_retry_pending_allocs(struct kbase_context *kctx)
 {
 	struct kbase_kcpu_command_queue *blocked_queue;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock);

 	/*
 	 * Reschedule all queues blocked by JIT_ALLOC commands.
 	 * NOTE: This code traverses the list of blocked queues directly. It
 	 * only works as long as the queued works are not executed at the same
 	 * time. This precondition is true since we're holding the
-	 * kbase_csf_kcpu_queue_context.lock .
+	 * kbase_csf_kcpu_queue_context.jit_lock .
 	 */
-	list_for_each_entry(blocked_queue,
-			&kctx->csf.kcpu_queues.jit_blocked_queues, jit_blocked)
-		queue_work(kctx->csf.kcpu_queues.wq, &blocked_queue->work);
+	list_for_each_entry(blocked_queue, &kctx->csf.kcpu_queues.jit_blocked_queues, jit_blocked)
+		queue_work(blocked_queue->wq, &blocked_queue->work);
 }

 static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue,
@ -469,7 +483,8 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue,
 	if (WARN_ON(!ids))
 		return -EINVAL;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);
+	mutex_lock(&kctx->csf.kcpu_queues.jit_lock);

 	KBASE_TLSTREAM_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_EXECUTE_JIT_FREE_END(queue->kctx->kbdev,
 									   queue);
@ -501,9 +516,6 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue,
 			queue->kctx->kbdev, queue, item_err, pages_used);
 	}

-	/* Free the list of ids */
-	kfree(ids);
-
 	/*
 	 * Remove this command from the jit_cmds_head list and retry pending
 	 * allocations.
@ -511,6 +523,11 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue,
 	list_del(&cmd->info.jit_free.node);
 	kbase_kcpu_jit_retry_pending_allocs(kctx);

+	mutex_unlock(&kctx->csf.kcpu_queues.jit_lock);
+
+	/* Free the list of ids */
+	kfree(ids);
+
 	return rc;
 }

@ -526,7 +543,7 @@ static int kbase_kcpu_jit_free_prepare(
 	int ret;
 	u32 i;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	/* Sanity checks */
 	if (!count || count > ARRAY_SIZE(kctx->jit_alloc)) {
@ -572,10 +589,12 @@ static int kbase_kcpu_jit_free_prepare(
 	}

 	current_command->type = BASE_KCPU_COMMAND_TYPE_JIT_FREE;
-	list_add_tail(&current_command->info.jit_free.node,
-			&kctx->csf.kcpu_queues.jit_cmds_head);
 	current_command->info.jit_free.ids = ids;
 	current_command->info.jit_free.count = count;
+	mutex_lock(&kctx->csf.kcpu_queues.jit_lock);
+	list_add_tail(&current_command->info.jit_free.node,
+			&kctx->csf.kcpu_queues.jit_cmds_head);
+	mutex_unlock(&kctx->csf.kcpu_queues.jit_lock);

 	return 0;
 out_free:
@ -601,7 +620,7 @@ static int kbase_csf_queue_group_suspend_prepare(
 	int pinned_pages = 0, ret = 0;
 	struct kbase_va_region *reg;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (suspend_buf->size < csg_suspend_buf_size)
 		return -EINVAL;
@ -652,9 +671,12 @@ static int kbase_csf_queue_group_suspend_prepare(
 		u64 start, end, i;

 		if (((reg->flags & KBASE_REG_ZONE_MASK) != KBASE_REG_ZONE_SAME_VA) ||
-				reg->nr_pages < nr_pages ||
-				kbase_reg_current_backed_size(reg) !=
-					reg->nr_pages) {
+		    (kbase_reg_current_backed_size(reg) < nr_pages) ||
+		    !(reg->flags & KBASE_REG_CPU_WR) ||
+		    (reg->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE) ||
+		    (reg->flags & KBASE_REG_DONT_NEED) ||
+		    (reg->flags & KBASE_REG_ACTIVE_JIT_ALLOC) ||
+		    (reg->flags & KBASE_REG_NO_USER_FREE)) {
 			ret = -EINVAL;
 			goto out_clean_pages;
 		}
@ -703,9 +725,8 @@ static enum kbase_csf_event_callback_action event_cqs_callback(void *param)
 {
 	struct kbase_kcpu_command_queue *kcpu_queue =
 		(struct kbase_kcpu_command_queue *)param;
-	struct kbase_context *const kctx = kcpu_queue->kctx;

-	queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work);
+	queue_work(kcpu_queue->wq, &kcpu_queue->work);

 	return KBASE_CSF_EVENT_CALLBACK_KEEP;
 }
@ -735,7 +756,7 @@ static int kbase_kcpu_cqs_wait_process(struct kbase_device *kbdev,
 {
 	u32 i;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (WARN_ON(!cqs_wait->objs))
 		return -EINVAL;
@ -803,7 +824,7 @@ static int kbase_kcpu_cqs_wait_prepare(struct kbase_kcpu_command_queue *queue,
 	struct base_cqs_wait_info *objs;
 	unsigned int nr_objs = cqs_wait_info->nr_objs;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS)
 		return -EINVAL;
@ -857,7 +878,7 @@ static void kbase_kcpu_cqs_set_process(struct kbase_device *kbdev,
 {
 	unsigned int i;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (WARN_ON(!cqs_set->objs))
 		return;
@ -898,11 +919,10 @@ static int kbase_kcpu_cqs_set_prepare(
 		struct base_kcpu_command_cqs_set_info *cqs_set_info,
 		struct kbase_kcpu_command *current_command)
 {
-	struct kbase_context *const kctx = kcpu_queue->kctx;
 	struct base_cqs_set *objs;
 	unsigned int nr_objs = cqs_set_info->nr_objs;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS)
 		return -EINVAL;
@ -952,7 +972,7 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev,
 {
 	u32 i;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (WARN_ON(!cqs_wait_operation->objs))
 		return -EINVAL;
@ -1039,7 +1059,7 @@ static int kbase_kcpu_cqs_wait_operation_prepare(struct kbase_kcpu_command_queue
 	struct base_cqs_wait_operation_info *objs;
 	unsigned int nr_objs = cqs_wait_operation_info->nr_objs;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS)
 		return -EINVAL;
@ -1094,7 +1114,7 @@ static void kbase_kcpu_cqs_set_operation_process(
 {
 	unsigned int i;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	if (WARN_ON(!cqs_set_operation->objs))
 		return;
@ -1161,11 +1181,10 @@ static int kbase_kcpu_cqs_set_operation_prepare(
 		struct base_kcpu_command_cqs_set_operation_info *cqs_set_operation_info,
 		struct kbase_kcpu_command *current_command)
 {
-	struct kbase_context *const kctx = kcpu_queue->kctx;
 	struct base_cqs_set_operation_info *objs;
 	unsigned int nr_objs = cqs_set_operation_info->nr_objs;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS)
 		return -EINVAL;
@ -1212,7 +1231,7 @@ static void kbase_csf_fence_wait_callback(struct dma_fence *fence,
 				  fence->context, fence->seqno);

 	/* Resume kcpu command queue processing. */
-	queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work);
+	queue_work(kcpu_queue->wq, &kcpu_queue->work);
 }

 static void kbase_kcpu_fence_wait_cancel(
@ -1221,7 +1240,7 @@ static void kbase_kcpu_fence_wait_cancel(
 {
 	struct kbase_context *const kctx = kcpu_queue->kctx;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (WARN_ON(!fence_info->fence))
 		return;
@ -1293,7 +1312,7 @@ static void fence_timeout_callback(struct timer_list *timer)
 	kbase_sync_fence_info_get(fence, &info);

 	if (info.status == 1) {
-		queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work);
+		queue_work(kcpu_queue->wq, &kcpu_queue->work);
 	} else if (info.status == 0) {
 		dev_warn(kctx->kbdev->dev, "fence has not yet signalled in %ums",
 			 FENCE_WAIT_TIMEOUT_MS);
@ -1345,7 +1364,7 @@ static int kbase_kcpu_fence_wait_process(
 #endif
 	struct kbase_context *const kctx = kcpu_queue->kctx;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (WARN_ON(!fence_info->fence))
 		return -EINVAL;
@ -1401,7 +1420,6 @@ static int kbase_kcpu_fence_wait_prepare(
 		struct base_kcpu_command_fence_info *fence_info,
 		struct kbase_kcpu_command *current_command)
 {
-	struct kbase_context *const kctx = kcpu_queue->kctx;
 #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
 	struct fence *fence_in;
 #else
@ -1409,7 +1427,7 @@ static int kbase_kcpu_fence_wait_prepare(
 #endif
 	struct base_fence fence;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (copy_from_user(&fence, u64_to_user_ptr(fence_info->fence),
 			sizeof(fence)))
@ -1460,7 +1478,6 @@ static int kbase_kcpu_fence_signal_prepare(
 		struct base_kcpu_command_fence_info *fence_info,
 		struct kbase_kcpu_command *current_command)
 {
-	struct kbase_context *const kctx = kcpu_queue->kctx;
 #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
 	struct fence *fence_out;
 #else
@ -1471,7 +1488,7 @@ static int kbase_kcpu_fence_signal_prepare(
 	int ret = 0;
 	int fd;

-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&kcpu_queue->lock);

 	if (copy_from_user(&fence, u64_to_user_ptr(fence_info->fence),
 			sizeof(fence)))
@ -1549,11 +1566,9 @@ static void kcpu_queue_process_worker(struct work_struct *data)
 	struct kbase_kcpu_command_queue *queue = container_of(data,
 				struct kbase_kcpu_command_queue, work);

-	mutex_lock(&queue->kctx->csf.kcpu_queues.lock);
-
+	mutex_lock(&queue->lock);
 	kcpu_queue_process(queue, false);
-
-	mutex_unlock(&queue->kctx->csf.kcpu_queues.lock);
+	mutex_unlock(&queue->lock);
 }

 static int delete_queue(struct kbase_context *kctx, u32 id)
@ -1569,6 +1584,17 @@ static int delete_queue(struct kbase_context *kctx, u32 id)
 		KBASE_KTRACE_ADD_CSF_KCPU(kctx->kbdev, KCPU_QUEUE_DELETE,
 			queue, queue->num_pending_cmds, queue->cqs_wait_count);

+		/* Disassociate the queue from the system to prevent further
+		 * submissions. Draining pending commands would be acceptable
+		 * even if a new queue is created using the same ID.
+		 */
+		kctx->csf.kcpu_queues.array[id] = NULL;
+		bitmap_clear(kctx->csf.kcpu_queues.in_use, id, 1);
+
+		mutex_unlock(&kctx->csf.kcpu_queues.lock);
+
+		mutex_lock(&queue->lock);
+
 		/* Drain the remaining work for this queue first and go past
 		 * all the waits.
 		 */
@ -1580,17 +1606,17 @@ static int delete_queue(struct kbase_context *kctx, u32 id)
 		/* All CQS wait commands should have been cleaned up */
 		WARN_ON(queue->cqs_wait_count);

-		kctx->csf.kcpu_queues.array[id] = NULL;
-		bitmap_clear(kctx->csf.kcpu_queues.in_use, id, 1);
-
 		/* Fire the tracepoint with the mutex held to enforce correct
 		 * ordering with the summary stream.
 		 */
 		KBASE_TLSTREAM_TL_KBASE_DEL_KCPUQUEUE(kctx->kbdev, queue);

-		mutex_unlock(&kctx->csf.kcpu_queues.lock);
+		mutex_unlock(&queue->lock);

 		cancel_work_sync(&queue->work);
+		destroy_workqueue(queue->wq);
+
+		mutex_destroy(&queue->lock);

 		kfree(queue);
 	} else {
@ -1657,7 +1683,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue,
 	bool process_next = true;
 	size_t i;

-	lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock);
+	lockdep_assert_held(&queue->lock);

 	for (i = 0; i != queue->num_pending_cmds; ++i) {
 		struct kbase_kcpu_command *cmd =
@ -2058,9 +2084,11 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,

 	/* The offset to the first command that is being processed or yet to
 	 * be processed is of u8 type, so the number of commands inside the
-	 * queue cannot be more than 256.
+	 * queue cannot be more than 256. The current implementation expects
+	 * exactly 256, any other size will require the addition of wrapping
+	 * logic.
 	 */
-	BUILD_BUG_ON(KBASEP_KCPU_QUEUE_SIZE > 256);
+	BUILD_BUG_ON(KBASEP_KCPU_QUEUE_SIZE != 256);

 	/* Whilst the backend interface allows enqueueing multiple commands in
 	 * a single operation, the Base interface does not expose any mechanism
@ -2076,13 +2104,13 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,
 	}

 	mutex_lock(&kctx->csf.kcpu_queues.lock);
-
-	if (!kctx->csf.kcpu_queues.array[enq->id]) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	queue = kctx->csf.kcpu_queues.array[enq->id];
+	/* Take the queue lock first so that a racing delete_queue() cannot free the queue. */
+	if (queue)
+		mutex_lock(&queue->lock);
+	mutex_unlock(&kctx->csf.kcpu_queues.lock);
+	if (queue == NULL)
+		return -EINVAL;

 	if (kcpu_queue_get_space(queue) < enq->nr_commands) {
 		ret = -EBUSY;
@ -2097,7 +2125,7 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,
 	 * for the possibility to roll back.
 	 */

-	for (i = 0; (i != enq->nr_commands) && !ret; ++i, ++kctx->csf.kcpu_queues.num_cmds) {
+	for (i = 0; (i != enq->nr_commands) && !ret; ++i) {
 		struct kbase_kcpu_command *kcpu_cmd =
 			&queue->commands[(u8)(queue->start_offset + queue->num_pending_cmds + i)];
 		struct base_kcpu_command command;
@ -2120,7 +2148,7 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,
 			}
 		}

-		kcpu_cmd->enqueue_ts = kctx->csf.kcpu_queues.num_cmds;
+		kcpu_cmd->enqueue_ts = atomic64_inc_return(&kctx->csf.kcpu_queues.cmd_seq_num);
 		switch (command.type) {
 		case BASE_KCPU_COMMAND_TYPE_FENCE_WAIT:
 #if IS_ENABLED(CONFIG_SYNC_FILE)
@ -2208,13 +2236,10 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,

 		queue->num_pending_cmds += enq->nr_commands;
 		kcpu_queue_process(queue, false);
-	} else {
-		/* Roll back the number of enqueued commands */
-		kctx->csf.kcpu_queues.num_cmds -= i;
 	}

 out:
-	mutex_unlock(&kctx->csf.kcpu_queues.lock);
+	mutex_unlock(&queue->lock);

 	return ret;
 }
@ -2228,14 +2253,9 @@ int kbase_csf_kcpu_queue_context_init(struct kbase_context *kctx)
 	for (idx = 0; idx < KBASEP_MAX_KCPU_QUEUES; ++idx)
 		kctx->csf.kcpu_queues.array[idx] = NULL;

-	kctx->csf.kcpu_queues.wq = alloc_workqueue("mali_kbase_csf_kcpu",
-					WQ_UNBOUND | WQ_HIGHPRI, 0);
-	if (!kctx->csf.kcpu_queues.wq)
-		return -ENOMEM;
-
 	mutex_init(&kctx->csf.kcpu_queues.lock);

-	kctx->csf.kcpu_queues.num_cmds = 0;
+	atomic64_set(&kctx->csf.kcpu_queues.cmd_seq_num, 0);

 	return 0;
 }
@ -2253,7 +2273,6 @@ void kbase_csf_kcpu_queue_context_term(struct kbase_context *kctx)
 			(void)delete_queue(kctx, id);
 	}

-	destroy_workqueue(kctx->csf.kcpu_queues.wq);
 	mutex_destroy(&kctx->csf.kcpu_queues.lock);
 }

@ -2297,8 +2316,17 @@ int kbase_csf_kcpu_queue_new(struct kbase_context *kctx,
 		goto out;
 	}

+	queue->wq = alloc_workqueue("mali_kbase_csf_kcpu_wq_%i", WQ_UNBOUND | WQ_HIGHPRI, 0, idx);
+	if (queue->wq == NULL) {
+		kfree(queue);
+		ret = -ENOMEM;
+
+		goto out;
+	}
+
 	bitmap_set(kctx->csf.kcpu_queues.in_use, idx, 1);
 	kctx->csf.kcpu_queues.array[idx] = queue;
+	mutex_init(&queue->lock);
 	queue->kctx = kctx;
 	queue->start_offset = 0;
 	queue->num_pending_cmds = 0;
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h
@ -236,9 +236,11 @@ struct kbase_kcpu_command {
 /**
 * struct kbase_kcpu_command_queue - a command queue executed by the kernel
 *
+ * @lock:			Lock to protect accesses to this queue.
 * @kctx:			The context to which this command queue belongs.
 * @commands:			Array of commands which have been successfully
 *				enqueued to this command queue.
+ * @wq:				Dedicated workqueue for processing commands.
 * @work:			struct work_struct which contains a pointer to
 *				the function which handles processing of kcpu
 *				commands enqueued into a kcpu command queue;
@ -274,8 +276,10 @@ struct kbase_kcpu_command {
 * @fence_timeout:		Timer used to detect the fence wait timeout.
 */
 struct kbase_kcpu_command_queue {
+	struct mutex lock;
 	struct kbase_context *kctx;
 	struct kbase_kcpu_command commands[KBASEP_KCPU_QUEUE_SIZE];
+	struct workqueue_struct *wq;
 	struct work_struct work;
 	u8 start_offset;
 	u8 id;
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h
@ -163,6 +163,8 @@
 #define CSG_PROTM_SUSPEND_BUF_HI 0x004C /* () Protected mode suspend buffer, high word */
 #define CSG_CONFIG 0x0050 /* () CSG configuration options */
 #define CSG_ITER_TRACE_CONFIG 0x0054 /* () CSG trace configuration */
+#define CSG_DVS_BUF_LO 0x0060 /* () Normal mode deferred vertex shading work buffer, low word */
+#define CSG_DVS_BUF_HI 0x0064 /* () Normal mode deferred vertex shading work buffer, high word */

 /* CSG_OUTPUT_BLOCK register offsets */
 #define CSG_ACK 0x0000 /* () CSG acknowledge flags */
@ -547,6 +549,13 @@
 #define CS_STATUS_WAIT_SB_MASK_SET(reg_val, value) \
 	(((reg_val) & ~CS_STATUS_WAIT_SB_MASK_MASK) |  \
 	 (((value) << CS_STATUS_WAIT_SB_MASK_SHIFT) & CS_STATUS_WAIT_SB_MASK_MASK))
+#define CS_STATUS_WAIT_SB_SOURCE_SHIFT 16
+#define CS_STATUS_WAIT_SB_SOURCE_MASK (0xF << CS_STATUS_WAIT_SB_SOURCE_SHIFT)
+#define CS_STATUS_WAIT_SB_SOURCE_GET(reg_val)                                                      \
+	(((reg_val)&CS_STATUS_WAIT_SB_SOURCE_MASK) >> CS_STATUS_WAIT_SB_SOURCE_SHIFT)
+#define CS_STATUS_WAIT_SB_SOURCE_SET(reg_val, value)                                               \
+	(((reg_val) & ~CS_STATUS_WAIT_SB_SOURCE_MASK) |                                            \
+	 (((value) << CS_STATUS_WAIT_SB_SOURCE_SHIFT) & CS_STATUS_WAIT_SB_SOURCE_MASK))
 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_SHIFT 24
 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_MASK (0xF << CS_STATUS_WAIT_SYNC_WAIT_CONDITION_SHIFT)
 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GET(reg_val) \
@ -557,6 +566,7 @@
 /* CS_STATUS_WAIT_SYNC_WAIT_CONDITION values */
 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_LE 0x0
 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GT 0x1
+#define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GE 0x5
 /* End of CS_STATUS_WAIT_SYNC_WAIT_CONDITION values */
 #define CS_STATUS_WAIT_PROGRESS_WAIT_SHIFT 28
 #define CS_STATUS_WAIT_PROGRESS_WAIT_MASK (0x1 << CS_STATUS_WAIT_PROGRESS_WAIT_SHIFT)
@ -835,11 +845,6 @@
 #define CSG_REQ_IDLE_GET(reg_val) (((reg_val)&CSG_REQ_IDLE_MASK) >> CSG_REQ_IDLE_SHIFT)
 #define CSG_REQ_IDLE_SET(reg_val, value) \
 	(((reg_val) & ~CSG_REQ_IDLE_MASK) | (((value) << CSG_REQ_IDLE_SHIFT) & CSG_REQ_IDLE_MASK))
-#define CSG_REQ_DOORBELL_SHIFT 30
-#define CSG_REQ_DOORBELL_MASK (0x1 << CSG_REQ_DOORBELL_SHIFT)
-#define CSG_REQ_DOORBELL_GET(reg_val) (((reg_val)&CSG_REQ_DOORBELL_MASK) >> CSG_REQ_DOORBELL_SHIFT)
-#define CSG_REQ_DOORBELL_SET(reg_val, value) \
-	(((reg_val) & ~CSG_REQ_DOORBELL_MASK) | (((value) << CSG_REQ_DOORBELL_SHIFT) & CSG_REQ_DOORBELL_MASK))
 #define CSG_REQ_PROGRESS_TIMER_EVENT_SHIFT 31
 #define CSG_REQ_PROGRESS_TIMER_EVENT_MASK (0x1 << CSG_REQ_PROGRESS_TIMER_EVENT_SHIFT)
 #define CSG_REQ_PROGRESS_TIMER_EVENT_GET(reg_val) \
@ -956,6 +961,21 @@
 	(((reg_val) & ~CSG_PROTM_SUSPEND_BUF_POINTER_MASK) |  \
 	 (((value) << CSG_PROTM_SUSPEND_BUF_POINTER_SHIFT) & CSG_PROTM_SUSPEND_BUF_POINTER_MASK))

+/* CSG_DVS_BUF_BUFFER register */
+#define CSG_DVS_BUF_BUFFER_SIZE_SHIFT GPU_U(0)
+#define CSG_DVS_BUF_BUFFER_SIZE_MASK (GPU_U(0xFFF) << CSG_DVS_BUF_BUFFER_SIZE_SHIFT)
+#define CSG_DVS_BUF_BUFFER_SIZE_GET(reg_val) (((reg_val)&CSG_DVS_BUF_BUFFER_SIZE_MASK) >> CSG_DVS_BUF_BUFFER_SIZE_SHIFT)
+#define CSG_DVS_BUF_BUFFER_SIZE_SET(reg_val, value) \
+	(((reg_val) & ~CSG_DVS_BUF_BUFFER_SIZE_MASK) |  \
+	 (((value) << CSG_DVS_BUF_BUFFER_SIZE_SHIFT) & CSG_DVS_BUF_BUFFER_SIZE_MASK))
+#define CSG_DVS_BUF_BUFFER_POINTER_SHIFT GPU_U(12)
+#define CSG_DVS_BUF_BUFFER_POINTER_MASK                                                            \
+	(GPU_ULL(0xFFFFFFFFFFFFF) << CSG_DVS_BUF_BUFFER_POINTER_SHIFT)
+#define CSG_DVS_BUF_BUFFER_POINTER_GET(reg_val) \
+	(((reg_val)&CSG_DVS_BUF_BUFFER_POINTER_MASK) >> CSG_DVS_BUF_BUFFER_POINTER_SHIFT)
+#define CSG_DVS_BUF_BUFFER_POINTER_SET(reg_val, value) \
+	(((reg_val) & ~CSG_DVS_BUF_BUFFER_POINTER_MASK) |  \
+	 (((value) << CSG_DVS_BUF_BUFFER_POINTER_SHIFT) & CSG_DVS_BUF_BUFFER_POINTER_MASK))

 /* End of CSG_INPUT_BLOCK register set definitions */

--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -21,7 +21,7 @@

 #include <mali_kbase.h>
 #include <mali_kbase_ctx_sched.h>
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <device/mali_kbase_device.h>
 #include <backend/gpu/mali_kbase_irq_internal.h>
 #include <backend/gpu/mali_kbase_pm_internal.h>
@ -29,7 +29,7 @@
 #include <csf/mali_kbase_csf_trace_buffer.h>
 #include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
 #include <mali_kbase_reset_gpu.h>
-#include <linux/string.h>
+#include <csf/mali_kbase_csf_firmware_log.h>

 enum kbasep_soft_reset_status {
 	RESET_SUCCESS = 0,
@ -257,68 +257,6 @@ static void kbase_csf_debug_dump_registers(struct kbase_device *kbdev)
 		kbase_reg_read(kbdev, GPU_CONTROL_REG(TILER_CONFIG)));
 }

-static void kbase_csf_dump_firmware_trace_buffer(struct kbase_device *kbdev)
-{
-	u8 *buf, *p, *pnewline, *pend, *pendbuf;
-	unsigned int read_size, remaining_size;
-	struct firmware_trace_buffer *tb =
-		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);
-
-	if (tb == NULL) {
-		dev_dbg(kbdev->dev, "Can't get the trace buffer, firmware trace dump skipped");
-		return;
-	}
-
-	buf = kmalloc(PAGE_SIZE + 1, GFP_KERNEL);
-	if (buf == NULL) {
-		dev_err(kbdev->dev, "Short of memory, firmware trace dump skipped");
-		return;
-	}
-
-	buf[PAGE_SIZE] = 0;
-
-	p = buf;
-	pendbuf = &buf[PAGE_SIZE];
-
-	dev_err(kbdev->dev, "Firmware trace buffer dump:");
-	while ((read_size = kbase_csf_firmware_trace_buffer_read_data(tb, p,
-								pendbuf - p))) {
-		pend = p + read_size;
-		p = buf;
-
-		while (p < pend && (pnewline = memchr(p, '\n', pend - p))) {
-			/* Null-terminate the string */
-			*pnewline = 0;
-
-			dev_err(kbdev->dev, "FW> %s", p);
-
-			p = pnewline + 1;
-		}
-
-		remaining_size = pend - p;
-
-		if (!remaining_size) {
-			p = buf;
-		} else if (remaining_size < PAGE_SIZE) {
-			/* Copy unfinished string to the start of the buffer */
-			memmove(buf, p, remaining_size);
-			p = &buf[remaining_size];
-		} else {
-			/* Print abnormal page-long string without newlines */
-			dev_err(kbdev->dev, "FW> %s", buf);
-			p = buf;
-		}
-	}
-
-	if (p != buf) {
-		/* Null-terminate and print last unfinished string */
-		*p = 0;
-		dev_err(kbdev->dev, "FW> %s", buf);
-	}
-
-	kfree(buf);
-}
-
 /**
 * kbase_csf_hwcnt_on_reset_error() - Sets HWCNT to appropriate state in the
 *                                    event of an error during GPU reset.
@ -378,7 +316,6 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic
 		"The flush has completed so reset the active indicator\n");
 	kbdev->irq_reset_flush = false;

-	mutex_lock(&kbdev->pm.lock);
 	if (!silent)
 		dev_err(kbdev->dev, "Resetting GPU (allowing up to %d ms)",
 								RESET_TIMEOUT);
@ -389,7 +326,7 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic
 	if (!silent) {
 		kbase_csf_debug_dump_registers(kbdev);
 		if (likely(firmware_inited))
-			kbase_csf_dump_firmware_trace_buffer(kbdev);
+			kbase_csf_firmware_log_dump_buffer(kbdev);
 	}

 	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
@ -403,6 +340,7 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic
 	 */
 	kbase_hwcnt_backend_csf_on_before_reset(&kbdev->hwcnt_gpu_iface);

+	mutex_lock(&kbdev->pm.lock);
 	/* Reset the GPU */
 	err = kbase_pm_init_hw(kbdev, 0);

@ -633,6 +571,11 @@ bool kbase_reset_gpu_is_active(struct kbase_device *kbdev)
 	return kbase_csf_reset_state_is_active(reset_state);
 }

+bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev)
+{
+	return atomic_read(&kbdev->csf.reset.state) == KBASE_CSF_RESET_GPU_NOT_PENDING;
+}
+
 int kbase_reset_gpu_wait(struct kbase_device *kbdev)
 {
 	const long wait_timeout =
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h
@ -36,7 +36,9 @@
 * If the CSG is already scheduled and resident, the CSI will be started
 * right away, otherwise once the group is made resident.
 *
- * Return: 0 on success, or negative on failure.
+ * Return: 0 on success, or negative on failure. -EBUSY is returned to
+ * indicate to the caller that the queue could not be enabled due to the
+ * Scheduler state; the caller can try to enable the queue again later.
 */
 int kbase_csf_scheduler_queue_start(struct kbase_queue *queue);

@ -530,12 +532,30 @@ static inline void kbase_csf_scheduler_invoke_tick(struct kbase_device *kbdev)
 	struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
 	unsigned long flags;

+	KBASE_KTRACE_ADD(kbdev, SCHEDULER_TICK_INVOKE, NULL, 0u);
 	spin_lock_irqsave(&scheduler->interrupt_lock, flags);
 	if (!scheduler->tick_timer_active)
 		queue_work(scheduler->wq, &scheduler->tick_work);
 	spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
 }

+/**
+ * kbase_csf_scheduler_invoke_tock() - Invoke the scheduling tock
+ *
+ * @kbdev: Pointer to the device
+ *
+ * This function will queue the scheduling tock work item for immediate
+ * execution.
+ */
+static inline void kbase_csf_scheduler_invoke_tock(struct kbase_device *kbdev)
+{
+	struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+
+	KBASE_KTRACE_ADD(kbdev, SCHEDULER_TOCK_INVOKE, NULL, 0u);
+	if (atomic_cmpxchg(&scheduler->pending_tock_work, false, true) == false)
+		mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
+}
+
 /**
 * kbase_csf_scheduler_queue_has_trace() - report whether the queue has been
 *                                         configured to operate with the
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h
@ -23,7 +23,6 @@
 #define _KBASE_CSF_TILER_HEAP_H_

 #include <mali_kbase.h>
-
 /**
 * kbase_csf_tiler_heap_context_init - Initialize the tiler heaps context for a
 *                                     GPU address space
@ -58,6 +57,12 @@ void kbase_csf_tiler_heap_context_term(struct kbase_context *kctx);
 * @target_in_flight: Number of render-passes that the driver should attempt to
 *                    keep in flight for which allocation of new chunks is
 *                    allowed. Must not be zero.
+ * @buf_desc_va: Buffer descriptor GPU virtual address. This is a hint
+ *               indicating that the caller intends to reclaim tiler heap
+ *               chunks that are hoarded by the hardware while the
+ *               associated shader activities are suspended and the CSGs are
+ *               off slots. If such reclaim is not desired, set it
+ *               to 0.
 * @gpu_heap_va: Where to store the GPU virtual address of the context that was
 *               set up for the tiler heap.
 * @first_chunk_va: Where to store the GPU virtual address of the first chunk
@ -66,10 +71,9 @@ void kbase_csf_tiler_heap_context_term(struct kbase_context *kctx);
 *
 * Return: 0 if successful or a negative error code on failure.
 */
-int kbase_csf_tiler_heap_init(struct kbase_context *kctx,
-	u32 chunk_size, u32 initial_chunks, u32 max_chunks,
-	u16 target_in_flight, u64 *gpu_heap_va,
-	u64 *first_chunk_va);
+int kbase_csf_tiler_heap_init(struct kbase_context *kctx, u32 chunk_size, u32 initial_chunks,
+			      u32 max_chunks, u16 target_in_flight, u64 const buf_desc_va,
+			      u64 *gpu_heap_va, u64 *first_chunk_va);

 /**
 * kbase_csf_tiler_heap_term - Terminate a chunked tiler memory heap.
@ -112,4 +116,27 @@ int kbase_csf_tiler_heap_term(struct kbase_context *kctx, u64 gpu_heap_va);
 */
 int kbase_csf_tiler_heap_alloc_new_chunk(struct kbase_context *kctx,
 	u64 gpu_heap_va, u32 nr_in_flight, u32 pending_frag_count, u64 *new_chunk_ptr);
+
+/**
+ * kbase_csf_tiler_heap_scan_kctx_unused_pages - Performs the scan step of the tiler heap
+ *                                               shrinker reclaim.
+ *
+ * @kctx:               Pointer to the kbase context on which the tiler heap reclaim is to
+ *                      operate.
+ * @to_free:            Number of pages suggested for the reclaim scan (free) method to reach.
+ *
+ * Return: the actual number of pages the scan method has freed from the call.
+ */
+u32 kbase_csf_tiler_heap_scan_kctx_unused_pages(struct kbase_context *kctx, u32 to_free);
+
+/**
+ * kbase_csf_tiler_heap_count_kctx_unused_pages - Performs the count step of the tiler heap
+ *                                                shrinker reclaim.
+ *
+ * @kctx:               Pointer to the kbase context on which the tiler heap reclaim is to
+ *                      operate.
+ *
+ * Return: a number of pages that could likely be freed on the subsequent scan method call.
+ */
+u32 kbase_csf_tiler_heap_count_kctx_unused_pages(struct kbase_context *kctx);
 #endif
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h
@ -56,12 +56,20 @@
 	((CHUNK_HDR_NEXT_ADDR_MASK >> CHUNK_HDR_NEXT_ADDR_POS) << \
 	 CHUNK_HDR_NEXT_ADDR_ENCODE_SHIFT)

+/* The size of the area needed to be vmapped prior to handing the tiler heap
+ * over to the tiler, so that the shrinker could be invoked.
+ */
+#define NEXT_CHUNK_ADDR_SIZE (sizeof(u64))
+
 /**
 * struct kbase_csf_tiler_heap_chunk - A tiler heap chunk managed by the kernel
 *
 * @link:   Link to this chunk in a list of chunks belonging to a
 *          @kbase_csf_tiler_heap.
 * @region: Pointer to the GPU memory region allocated for the chunk.
+ * @map:    Kernel VA mapping so that we would not need to use vmap in the
+ *          shrinker callback, which can allocate. This maps only the header
+ *          of the chunk, so it could be traversed.
 * @gpu_va: GPU virtual address of the start of the memory region.
 *          This points to the header of the chunk and not to the low address
 *          of free memory within it.
@ -75,9 +83,12 @@
 struct kbase_csf_tiler_heap_chunk {
 	struct list_head link;
 	struct kbase_va_region *region;
+	struct kbase_vmap_struct map;
 	u64 gpu_va;
 };

+#define HEAP_BUF_DESCRIPTOR_CHECKED (1 << 0)
+
 /**
 * struct kbase_csf_tiler_heap - A tiler heap managed by the kernel
 *
@ -85,6 +96,20 @@ struct kbase_csf_tiler_heap_chunk {
 *                   associated.
 * @link:            Link to this heap in a list of tiler heaps belonging to
 *                   the @kbase_csf_tiler_heap_context.
+ * @chunks_list:     Linked list of allocated chunks.
+ * @gpu_va:          The GPU virtual address of the heap context structure that
+ *                   was allocated for the firmware. This is also used to
+ *                   uniquely identify the heap.
+ * @heap_id:         Unique id representing the heap, assigned during heap
+ *                   initialization.
+ * @buf_desc_va:     Buffer descriptor GPU VA. Can be 0 for backward compatibility
+ *                   with earlier versions of the base interface.
+ * @buf_desc_reg:    Pointer to the VA region that covers the provided buffer
+ *                   descriptor memory object pointed to by buf_desc_va.
+ * @gpu_va_map:      Kernel VA mapping of the GPU VA region.
+ * @buf_desc_map:    Kernel VA mapping of the buffer descriptor, read from
+ *                   during the tiler heap shrinker. Sync operations may need
+ *                   to be done before each read.
 * @chunk_size:      Size of each chunk, in bytes. Must be page-aligned.
 * @chunk_count:     The number of chunks currently allocated. Must not be
 *                   zero or greater than @max_chunks.
@ -93,22 +118,23 @@ struct kbase_csf_tiler_heap_chunk {
 * @target_in_flight: Number of render-passes that the driver should attempt
 *                    to keep in flight for which allocation of new chunks is
 *                    allowed. Must not be zero.
- * @gpu_va:          The GPU virtual address of the heap context structure that
- *                   was allocated for the firmware. This is also used to
- *                   uniquely identify the heap.
- * @heap_id:         Unique id representing the heap, assigned during heap
- *                   initialization.
- * @chunks_list:     Linked list of allocated chunks.
+ * @buf_desc_checked: Indicates if runtime check on buffer descriptor has been done.
 */
 struct kbase_csf_tiler_heap {
 	struct kbase_context *kctx;
 	struct list_head link;
+	struct list_head chunks_list;
+	u64 gpu_va;
+	u64 heap_id;
+	u64 buf_desc_va;
+	struct kbase_va_region *buf_desc_reg;
+	struct kbase_vmap_struct buf_desc_map;
+	struct kbase_vmap_struct gpu_va_map;
 	u32 chunk_size;
 	u32 chunk_count;
 	u32 max_chunks;
 	u16 target_in_flight;
-	u64 gpu_va;
-	u64 heap_id;
-	struct list_head chunks_list;
+	bool buf_desc_checked;
 };
+
 #endif /* !_KBASE_CSF_TILER_HEAP_DEF_H_ */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.c
@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#include <mali_kbase.h>
+#include "mali_kbase_csf.h"
+#include "mali_kbase_csf_tiler_heap.h"
+#include "mali_kbase_csf_tiler_heap_reclaim.h"
+
+/* Tiler heap shrinker seek value, needs to be higher than jit and memory pools */
+#define HEAP_SHRINKER_SEEKS (DEFAULT_SEEKS + 2)
+
+/* Tiler heap shrinker batch value */
+#define HEAP_SHRINKER_BATCH (512)
+
+/* Tiler heap reclaim scan (free) method size for limiting a scan run length */
+#define HEAP_RECLAIM_SCAN_BATCH_SIZE (HEAP_SHRINKER_BATCH << 7)
+
+/**
+ * get_kctx_highest_csg_priority - Find the best CSG priority level used by a context.
+ *
+ * @kctx: Context whose runnable and idle-wait groups are inspected.
+ *
+ * The search takes the numerically smallest level, starting from
+ * KBASE_QUEUE_GROUP_PRIORITY_REALTIME. Runnable groups are looked at first;
+ * idle-wait groups can only lower the value found there.
+ *
+ * Return: The selected priority level; KBASE_QUEUE_GROUP_PRIORITY_LOW when no
+ *         group yields a smaller value.
+ */
+static u8 get_kctx_highest_csg_priority(struct kbase_context *kctx)
+{
+	u8 best = KBASE_QUEUE_GROUP_PRIORITY_REALTIME;
+	struct kbase_queue_group *grp;
+
+	/* Advance past empty runnable lists, but never beyond the LOW level */
+	while (best < KBASE_QUEUE_GROUP_PRIORITY_LOW &&
+	       list_empty(&kctx->csf.sched.runnable_groups[best]))
+		best++;
+
+	/* Nothing can beat the first level, and there may be no idle-wait groups */
+	if (best == KBASE_QUEUE_GROUP_PRIORITY_REALTIME || !kctx->csf.sched.num_idle_wait_grps)
+		return best;
+
+	list_for_each_entry(grp, &kctx->csf.sched.idle_wait_groups, link) {
+		if (grp->priority < best)
+			best = grp->priority;
+	}
+
+	return best;
+}
+
+/**
+ * detach_ctx_from_heap_reclaim_mgr - Take a context off the scheduler's heap reclaim list.
+ *
+ * @kctx: Context to detach.
+ *
+ * Does nothing if the context is not currently linked. Otherwise the part of
+ * the context's estimate that has not yet been accounted as freed is removed
+ * from the manager's unused_pages total. The scheduler lock must be held.
+ */
+static void detach_ctx_from_heap_reclaim_mgr(struct kbase_context *kctx)
+{
+	struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler;
+	struct kbase_csf_ctx_heap_reclaim_info *info = &kctx->csf.sched.heap_info;
+
+	lockdep_assert_held(&scheduler->lock);
+
+	if (!list_empty(&info->mgr_link)) {
+		/* Outstanding share of the estimate, saturating at zero */
+		u32 remaining = (info->nr_est_unused_pages > info->nr_freed_pages) ?
+					info->nr_est_unused_pages - info->nr_freed_pages :
+					0;
+
+		list_del_init(&info->mgr_link);
+		if (remaining)
+			WARN_ON(atomic_sub_return(remaining, &scheduler->reclaim_mgr.unused_pages) <
+				0);
+
+		/* Format fixed: a stray '0' used to be printed in front of est_pages */
+		dev_dbg(kctx->kbdev->dev,
+			"Reclaim_mgr_detach: ctx_%d_%d, est_pages=%u, freed_pages=%u", kctx->tgid,
+			kctx->id, info->nr_est_unused_pages, info->nr_freed_pages);
+	}
+}
+
+/**
+ * attach_ctx_to_heap_reclaim_mgr - Queue a context on the scheduler's heap reclaim list.
+ *
+ * @kctx: Context to attach.
+ *
+ * The context is appended to the list matching its best CSG priority level and
+ * its freshly counted unused-page estimate is added to the manager total. The
+ * scheduler lock must be held.
+ */
+static void attach_ctx_to_heap_reclaim_mgr(struct kbase_context *kctx)
+{
+	struct kbase_device *const kbdev = kctx->kbdev;
+	struct kbase_csf_sched_heap_reclaim_mgr *const mgr = &kbdev->csf.scheduler.reclaim_mgr;
+	struct kbase_csf_ctx_heap_reclaim_info *const ctx_info = &kctx->csf.sched.heap_info;
+	u8 const level = get_kctx_highest_csg_priority(kctx);
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	/* Being linked already is unexpected: warn and unlink before re-adding */
+	if (WARN_ON(!list_empty(&ctx_info->mgr_link)))
+		list_del_init(&ctx_info->mgr_link);
+
+	/* Fresh estimate of what could be freed; nothing freed yet for this round */
+	ctx_info->nr_est_unused_pages = kbase_csf_tiler_heap_count_kctx_unused_pages(kctx);
+	ctx_info->nr_freed_pages = 0;
+
+	list_add_tail(&ctx_info->mgr_link, &mgr->ctx_lists[level]);
+	/* Fold the estimate into the manager-wide total */
+	atomic_add(ctx_info->nr_est_unused_pages, &mgr->unused_pages);
+
+	dev_dbg(kbdev->dev, "Reclaim_mgr_attach: ctx_%d_%d, est_count_pages=%u", kctx->tgid,
+		kctx->id, ctx_info->nr_est_unused_pages);
+}
+
+/* Scheduler notification: @group has been placed on-slot. Bumps the context's
+ * on-slot group count and, on the 0 => 1 transition, removes the context from
+ * the reclaim manager. The scheduler lock must be held.
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_active(struct kbase_queue_group *group)
+{
+	struct kbase_context *const ctx = group->kctx;
+	struct kbase_csf_ctx_heap_reclaim_info *const heap_info = &ctx->csf.sched.heap_info;
+
+	lockdep_assert_held(&ctx->kbdev->csf.scheduler.lock);
+
+	heap_info->on_slot_grps++;
+
+	/* Only the first on-slot group of the context triggers the detach */
+	if (heap_info->on_slot_grps != 1)
+		return;
+
+	dev_dbg(ctx->kbdev->dev, "CSG_%d_%d_%d on-slot, remove kctx from reclaim manager",
+		ctx->tgid, ctx->id, group->handle);
+
+	detach_ctx_from_heap_reclaim_mgr(ctx);
+}
+
+/* Scheduler notification: @group has been evicted. The context's on-slot group
+ * count is recomputed from the in-use CSG slots, then the context is attached
+ * to or detached from the reclaim manager depending on whether it still has
+ * runnable or idle-wait groups. The scheduler lock must be held.
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict(struct kbase_queue_group *group)
+{
+	struct kbase_context *const ctx = group->kctx;
+	struct kbase_device *const kbdev = ctx->kbdev;
+	struct kbase_csf_scheduler *const sched = &kbdev->csf.scheduler;
+	struct kbase_csf_ctx_heap_reclaim_info *const heap_info = &ctx->csf.sched.heap_info;
+	const u32 nr_slots = kbdev->csf.global_iface.group_num;
+	u32 resident = 0;
+	u32 slot;
+
+	lockdep_assert_held(&sched->lock);
+
+	/* Evictions are comparatively rare, so recount the context's on-slot
+	 * groups from scratch rather than adjusting the running counter.
+	 */
+	for_each_set_bit(slot, sched->csg_inuse_bitmap, nr_slots) {
+		struct kbase_queue_group *const occupant = sched->csg_slots[slot].resident_group;
+
+		if (likely(occupant) && occupant->kctx == ctx)
+			resident++;
+	}
+
+	heap_info->on_slot_grps = resident;
+
+	/* Reclaim bookkeeping only changes once nothing of the context is on-slot */
+	if (heap_info->on_slot_grps)
+		return;
+
+	if (!ctx->csf.sched.num_runnable_grps && !ctx->csf.sched.num_idle_wait_grps) {
+		/* No operational group is left: the context is a zombie, drop it */
+		dev_dbg(kbdev->dev,
+			"CSG_%d_%d_%d evict leading to zombie kctx, dettach from reclaim manager",
+			ctx->tgid, ctx->id, group->handle);
+
+		detach_ctx_from_heap_reclaim_mgr(ctx);
+		return;
+	}
+
+	/* Other operational groups remain: make sure the context is attached */
+	if (list_empty(&heap_info->mgr_link)) {
+		dev_dbg(kbdev->dev, "CSG_%d_%d_%d evict, add kctx to reclaim manager",
+			ctx->tgid, ctx->id, group->handle);
+
+		attach_ctx_to_heap_reclaim_mgr(ctx);
+	}
+}
+
+/* Scheduler notification: @group has been suspended (taken off-slot). Drops the
+ * context's on-slot group count and, when it reaches zero, attaches the context
+ * to the reclaim manager. The scheduler lock must be held.
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend(struct kbase_queue_group *group)
+{
+	struct kbase_context *const ctx = group->kctx;
+	struct kbase_csf_ctx_heap_reclaim_info *const heap_info = &ctx->csf.sched.heap_info;
+	bool unbalanced;
+
+	lockdep_assert_held(&ctx->kbdev->csf.scheduler.lock);
+
+	/* A zero count here means the notifications were unbalanced; warn and
+	 * leave the counter untouched instead of wrapping it.
+	 */
+	unbalanced = WARN_ON(heap_info->on_slot_grps == 0);
+	if (!unbalanced)
+		heap_info->on_slot_grps--;
+
+	if (heap_info->on_slot_grps != 0)
+		return;
+
+	/* Last on-slot group gone: hand the context to the reclaim manager */
+	dev_dbg(ctx->kbdev->dev, "CSG_%d_%d_%d off-slot, add kctx to reclaim manager",
+		ctx->tgid, ctx->id, group->handle);
+
+	attach_ctx_to_heap_reclaim_mgr(ctx);
+}
+
+/**
+ * reclaim_unused_heap_pages - Free unused tiler heap pages of contexts on the reclaim lists.
+ *
+ * @kbdev: Device whose scheduler reclaim manager is scanned.
+ *
+ * Walks the per-priority context lists starting at
+ * KBASE_QUEUE_GROUP_PRIORITY_LOW and stepping down to
+ * KBASE_QUEUE_GROUP_PRIORITY_REALTIME, and stops once at least
+ * HEAP_RECLAIM_SCAN_BATCH_SIZE pages have been freed. The scheduler lock must
+ * be held by the caller.
+ *
+ * Return: Total number of pages freed by this scan.
+ */
+static unsigned long reclaim_unused_heap_pages(struct kbase_device *kbdev)
+{
+	struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+	struct kbase_csf_sched_heap_reclaim_mgr *const mgr = &scheduler->reclaim_mgr;
+	unsigned long total_freed_pages = 0;
+	int prio;
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	for (prio = KBASE_QUEUE_GROUP_PRIORITY_LOW;
+	     total_freed_pages < HEAP_RECLAIM_SCAN_BATCH_SIZE &&
+	     prio >= KBASE_QUEUE_GROUP_PRIORITY_REALTIME;
+	     prio--) {
+		struct kbase_csf_ctx_heap_reclaim_info *info, *tmp;
+		u32 cnt_ctxs = 0;
+
+		/* The _safe iterator is needed: the current entry may be detached below */
+		list_for_each_entry_safe(info, tmp, &scheduler->reclaim_mgr.ctx_lists[prio],
+					 mgr_link) {
+			struct kbase_context *kctx =
+				container_of(info, struct kbase_context, csf.sched.heap_info);
+			u32 freed_pages = kbase_csf_tiler_heap_scan_kctx_unused_pages(
+				kctx, info->nr_est_unused_pages);
+
+			if (freed_pages) {
+				/* Remove the freed pages from the manager retained estimate. The
+				 * accumulated removals from the kctx should not exceed the kctx
+				 * initially notified contribution amount:
+				 *   info->nr_est_unused_pages.
+				 */
+				u32 rm_cnt = MIN(info->nr_est_unused_pages - info->nr_freed_pages,
+						 freed_pages);
+
+				WARN_ON(atomic_sub_return(rm_cnt, &mgr->unused_pages) < 0);
+
+				/* tracking the freed pages, before a potential detach call */
+				info->nr_freed_pages += freed_pages;
+				total_freed_pages += freed_pages;
+
+				schedule_work(&kctx->jit_work);
+			}
+
+			/* If the kctx can't offer any more, drop it from the reclaim manager,
+			 * otherwise leave it remaining in. If the kctx changes its state (i.e.
+			 * some CSGs becoming on-slot), the scheduler will pull it out.
+			 */
+			if (info->nr_freed_pages >= info->nr_est_unused_pages || freed_pages == 0)
+				detach_ctx_from_heap_reclaim_mgr(kctx);
+
+			cnt_ctxs++;
+
+			/* Enough has been freed, break to avoid holding the lock too long */
+			if (total_freed_pages >= HEAP_RECLAIM_SCAN_BATCH_SIZE)
+				break;
+		}
+
+		dev_dbg(kbdev->dev, "Reclaim free heap pages: %lu (cnt_ctxs: %u, prio: %d)",
+			total_freed_pages, cnt_ctxs, prio);
+	}
+
+	dev_dbg(kbdev->dev, "Reclaim free total heap pages: %lu (across all CSG priority)",
+		total_freed_pages);
+
+	return total_freed_pages;
+}
+
+/**
+ * kbase_csf_tiler_heap_reclaim_count_free_pages - Report the estimated reclaimable pages.
+ *
+ * @kbdev: Device whose reclaim manager estimate is read.
+ * @sc:    Shrink control passed in by the shrinker callback (not used here).
+ *
+ * Return: The manager's current unused_pages estimate, or 0 if the counter is
+ *         not positive.
+ */
+static unsigned long kbase_csf_tiler_heap_reclaim_count_free_pages(struct kbase_device *kbdev,
+								   struct shrink_control *sc)
+{
+	struct kbase_csf_sched_heap_reclaim_mgr *mgr = &kbdev->csf.scheduler.reclaim_mgr;
+	int est_pages = atomic_read(&mgr->unused_pages);
+	unsigned long page_cnt;
+
+	/* atomic_read() yields a signed int. The accounting code WARNs when the
+	 * counter goes negative; such a value must not be converted to
+	 * unsigned long, where it would turn into an enormous object count.
+	 */
+	page_cnt = (est_pages > 0) ? (unsigned long)est_pages : 0;
+
+	dev_dbg(kbdev->dev, "Reclaim count unused pages (estimate): %lu", page_cnt);
+
+	return page_cnt;
+}
+
+/**
+ * kbase_csf_tiler_heap_reclaim_scan_free_pages - Try to free unused tiler heap pages.
+ *
+ * @kbdev: Device whose reclaim manager is scanned.
+ * @sc:    Shrink control of the shrinker callback; nr_scanned may be updated.
+ *
+ * The scheduler lock is only ever taken with mutex_trylock(), so this function
+ * does not block on it beyond the short timed wait below.
+ *
+ * Return: Number of pages freed, 0 if the scheduler lock could not be taken,
+ *         or SHRINK_STOP if pages were estimated as available but none could
+ *         be freed.
+ */
+static unsigned long kbase_csf_tiler_heap_reclaim_scan_free_pages(struct kbase_device *kbdev,
+								  struct shrink_control *sc)
+{
+	struct kbase_csf_sched_heap_reclaim_mgr *mgr = &kbdev->csf.scheduler.reclaim_mgr;
+	unsigned long freed = 0;
+	unsigned long avail = 0;
+
+	/* If the scheduler lock is contended, wait briefly and retry once;
+	 * give up and return 0 if it is still not available.
+	 */
+	if (!mutex_trylock(&kbdev->csf.scheduler.lock)) {
+		struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+
+		/* Wait for roughly 2-ms */
+		wait_event_timeout(kbdev->csf.event_wait, (scheduler->state != SCHED_BUSY),
+				   msecs_to_jiffies(2));
+		if (!mutex_trylock(&kbdev->csf.scheduler.lock)) {
+			dev_dbg(kbdev->dev, "Tiler heap reclaim scan see device busy (freed: 0)");
+			return 0;
+		}
+	}
+
+	/* Only scan when the manager estimate says there is something to free */
+	avail = atomic_read(&mgr->unused_pages);
+	if (avail)
+		freed = reclaim_unused_heap_pages(kbdev);
+
+	mutex_unlock(&kbdev->csf.scheduler.lock);
+
+#if (KERNEL_VERSION(4, 14, 0) <= LINUX_VERSION_CODE)
+	/* Report the real amount when more than the requested count was freed */
+	if (freed > sc->nr_to_scan)
+		sc->nr_scanned = freed;
+#endif /* (KERNEL_VERSION(4, 14, 0) <= LINUX_VERSION_CODE) */
+
+	dev_info(kbdev->dev, "Tiler heap reclaim scan freed pages: %lu (unused: %lu)", freed,
+		 avail);
+
+	/* On estimate suggesting available, yet actual free failed, return STOP */
+	if (avail && !freed)
+		return SHRINK_STOP;
+	else
+		return freed;
+}
+
+/* Shrinker count_objects callback: recover the owning device from the embedded
+ * shrinker and forward to the per-device page count helper.
+ */
+static unsigned long kbase_csf_tiler_heap_reclaim_count_objects(struct shrinker *s,
+								struct shrink_control *sc)
+{
+	return kbase_csf_tiler_heap_reclaim_count_free_pages(
+		container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim), sc);
+}
+
+/* Shrinker scan_objects callback: recover the owning device from the embedded
+ * shrinker and forward to the per-device scan helper.
+ */
+static unsigned long kbase_csf_tiler_heap_reclaim_scan_objects(struct shrinker *s,
+							       struct shrink_control *sc)
+{
+	return kbase_csf_tiler_heap_reclaim_scan_free_pages(
+		container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim), sc);
+}
+
+/* Reset the per-context heap reclaim info: all fields zeroed and the manager
+ * link initialised as an empty (unlinked) list node.
+ */
+void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx)
+{
+	struct kbase_csf_ctx_heap_reclaim_info *const heap_info = &kctx->csf.sched.heap_info;
+
+	memset(heap_info, 0, sizeof(*heap_info));
+	INIT_LIST_HEAD(&heap_info->mgr_link);
+}
+
+/* Set up the device's heap reclaim manager: empty per-priority context lists,
+ * a zeroed page estimate and the shrinker callbacks. The shrinker is registered
+ * unless CONFIG_MALI_VECTOR_DUMP is defined.
+ */
+void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev)
+{
+	struct kbase_csf_sched_heap_reclaim_mgr *const mgr = &kbdev->csf.scheduler.reclaim_mgr;
+	u8 level;
+
+	for (level = KBASE_QUEUE_GROUP_PRIORITY_REALTIME;
+	     level < KBASE_QUEUE_GROUP_PRIORITY_COUNT; level++)
+		INIT_LIST_HEAD(&mgr->ctx_lists[level]);
+
+	atomic_set(&mgr->unused_pages, 0);
+
+	mgr->heap_reclaim.count_objects = kbase_csf_tiler_heap_reclaim_count_objects;
+	mgr->heap_reclaim.scan_objects = kbase_csf_tiler_heap_reclaim_scan_objects;
+	mgr->heap_reclaim.seeks = HEAP_SHRINKER_SEEKS;
+	mgr->heap_reclaim.batch = HEAP_SHRINKER_BATCH;
+
+#if !defined(CONFIG_MALI_VECTOR_DUMP)
+	register_shrinker(&mgr->heap_reclaim);
+#endif
+}
+
+/* Tear down the device's heap reclaim manager: unregister the shrinker (unless
+ * CONFIG_MALI_VECTOR_DUMP is defined) and warn if any context is still listed
+ * or the page estimate is not back to zero.
+ */
+void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev)
+{
+	struct kbase_csf_sched_heap_reclaim_mgr *const mgr = &kbdev->csf.scheduler.reclaim_mgr;
+	u8 level;
+
+#if !defined(CONFIG_MALI_VECTOR_DUMP)
+	unregister_shrinker(&mgr->heap_reclaim);
+#endif
+
+	for (level = KBASE_QUEUE_GROUP_PRIORITY_REALTIME;
+	     level < KBASE_QUEUE_GROUP_PRIORITY_COUNT; level++)
+		WARN_ON(!list_empty(&mgr->ctx_lists[level]));
+
+	WARN_ON(atomic_read(&mgr->unused_pages));
+}
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.h
@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _KBASE_CSF_TILER_HEAP_RECLAIM_H_
+#define _KBASE_CSF_TILER_HEAP_RECLAIM_H_
+
+#include <mali_kbase.h>
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_active - Notifier function for the scheduler
+ *                                                        to use when a group is put on-slot.
+ *
+ * @group: Pointer to the group object that has been placed on-slot for running.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_active(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict - Notifier function for the scheduler
+ *               to use when a group is evicted out of the scheduler's scope, i.e. no run of
+ *               the group is possible afterwards.
+ *
+ * @group: Pointer to the group object that has been evicted.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend - Notifier function for the scheduler
+ *                to use when a group is suspended from running, but could resume in future.
+ *
+ * @group: Pointer to the group object that is in suspended state.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_ctx_init - Initializer on per context data fields for use
+ *                                         with the tiler heap reclaim manager.
+ *
+ * @kctx: Pointer to the kbase_context.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_mgr_init - Initializer for the tiler heap reclaim manager.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_mgr_term - Termination call for the tiler heap reclaim manager.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev);
+
+#endif
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c
@ -88,13 +88,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_tl_poll_interval_fops,
 		kbase_csf_tl_debugfs_poll_interval_read,
 		kbase_csf_tl_debugfs_poll_interval_write, "%llu\n");

-
 void kbase_csf_tl_reader_debugfs_init(struct kbase_device *kbdev)
 {
 	debugfs_create_file("csf_tl_poll_interval_in_ms", 0644,
 		kbdev->debugfs_instr_directory, kbdev,
 		&kbase_csf_tl_poll_interval_fops);
-
 }
 #endif

@ -166,11 +164,10 @@ static int kbase_ts_converter_init(
 *
 * Return: The CPU timestamp.
 */
-static void __maybe_unused
-kbase_ts_converter_convert(const struct kbase_ts_converter *self, u64 *gpu_ts)
+static u64 __maybe_unused
+kbase_ts_converter_convert(const struct kbase_ts_converter *self, u64 gpu_ts)
 {
-	u64 old_gpu_ts = *gpu_ts;
-	*gpu_ts = div64_u64(old_gpu_ts * self->multiplier, self->divisor) +
+	return div64_u64(gpu_ts * self->multiplier, self->divisor) +
 		  self->offset;
 }

@ -250,7 +247,6 @@ static void tl_reader_reset(struct kbase_csf_tl_reader *self)
 	self->tl_header.btc = 0;
 }

-
 int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self)
 {
 	int ret = 0;
@ -275,7 +271,6 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self)
 		return -EBUSY;
 	}

-
 	/* Copying the whole buffer in a single shot. We assume
 	 * that the buffer will not contain partially written messages.
 	 */
@ -326,8 +321,8 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self)
 		{
 			struct kbase_csffw_tl_message *msg =
 				(struct kbase_csffw_tl_message *) csffw_data_it;
-			kbase_ts_converter_convert(&self->ts_converter,
-						   &msg->timestamp);
+			msg->timestamp = kbase_ts_converter_convert(&self->ts_converter,
+						   msg->timestamp);
 		}

 		/* Copy the message out to the tl_stream. */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c
@ -119,7 +119,7 @@ static const struct firmware_trace_buffer_data trace_buffer_data[] = {
 #if MALI_UNIT_TEST
 	{ "fwutf", { 0 }, 1 },
 #endif
-	{ FW_TRACE_BUF_NAME, { 0 }, 4 },
+	{ FIRMWARE_LOG_BUF_NAME, { 0 }, 4 },
 	{ "benchmark", { 0 }, 2 },
 	{ "timeline", { 0 }, KBASE_CSF_TL_BUFFER_NR_PAGES },
 };
@ -506,10 +506,16 @@ unsigned int kbase_csf_firmware_trace_buffer_read_data(
 }
 EXPORT_SYMBOL(kbase_csf_firmware_trace_buffer_read_data);

-#if IS_ENABLED(CONFIG_DEBUG_FS)
+static void update_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, u64 mask)
+{
+	unsigned int i;
+
+	for (i = 0; i < tb->trace_enable_entry_count; i++)
+		kbasep_csf_firmware_trace_buffer_update_trace_enable_bit(tb, i, (mask >> i) & 1);
+}

 #define U32_BITS 32
-static u64 get_trace_buffer_active_mask64(struct firmware_trace_buffer *tb)
+u64 kbase_csf_firmware_trace_buffer_get_active_mask64(struct firmware_trace_buffer *tb)
 {
 	u64 active_mask = tb->trace_enable_init_mask[0];

@ -519,18 +525,7 @@ static u64 get_trace_buffer_active_mask64(struct firmware_trace_buffer *tb)
 	return active_mask;
 }

-static void update_trace_buffer_active_mask64(struct firmware_trace_buffer *tb,
-		u64 mask)
-{
-	unsigned int i;
-
-	for (i = 0; i < tb->trace_enable_entry_count; i++)
-		kbasep_csf_firmware_trace_buffer_update_trace_enable_bit(
-			tb, i, (mask >> i) & 1);
-}
-
-static int set_trace_buffer_active_mask64(struct firmware_trace_buffer *tb,
-		u64 mask)
+int kbase_csf_firmware_trace_buffer_set_active_mask64(struct firmware_trace_buffer *tb, u64 mask)
 {
 	struct kbase_device *kbdev = tb->kbdev;
 	unsigned long flags;
@ -558,123 +553,3 @@ static int set_trace_buffer_active_mask64(struct firmware_trace_buffer *tb,

 	return err;
 }
-
-static int kbase_csf_firmware_trace_enable_mask_read(void *data, u64 *val)
-{
-	struct kbase_device *kbdev = (struct kbase_device *)data;
-	struct firmware_trace_buffer *tb =
-		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);
-
-	if (tb == NULL) {
-		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
-		return -EIO;
-	}
-	/* The enabled traces limited to u64 here, regarded practical */
-	*val = get_trace_buffer_active_mask64(tb);
-	return 0;
-}
-
-static int kbase_csf_firmware_trace_enable_mask_write(void *data, u64 val)
-{
-	struct kbase_device *kbdev = (struct kbase_device *)data;
-	struct firmware_trace_buffer *tb =
-		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);
-	u64 new_mask;
-	unsigned int enable_bits_count;
-
-	if (tb == NULL) {
-		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
-		return -EIO;
-	}
-
-	/* Ignore unsupported types */
-	enable_bits_count =
-	    kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count(tb);
-	if (enable_bits_count > 64) {
-		dev_dbg(kbdev->dev, "Limit enabled bits count from %u to 64",
-			enable_bits_count);
-		enable_bits_count = 64;
-	}
-	new_mask = val & ((1 << enable_bits_count) - 1);
-
-	if (new_mask != get_trace_buffer_active_mask64(tb))
-		return set_trace_buffer_active_mask64(tb, new_mask);
-	else
-		return 0;
-}
-
-static int kbasep_csf_firmware_trace_debugfs_open(struct inode *in,
-		struct file *file)
-{
-	struct kbase_device *kbdev = in->i_private;
-
-	file->private_data = kbdev;
-	dev_dbg(kbdev->dev, "Opened firmware trace buffer dump debugfs file");
-
-	return 0;
-}
-
-static ssize_t kbasep_csf_firmware_trace_debugfs_read(struct file *file,
-		char __user *buf, size_t size, loff_t *ppos)
-{
-	struct kbase_device *kbdev = file->private_data;
-	u8 *pbyte;
-	unsigned int n_read;
-	unsigned long not_copied;
-	/* Limit the kernel buffer to no more than two pages */
-	size_t mem = MIN(size, 2 * PAGE_SIZE);
-	unsigned long flags;
-
-	struct firmware_trace_buffer *tb =
-		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);
-
-	if (tb == NULL) {
-		dev_err(kbdev->dev, "Couldn't get the firmware trace buffer");
-		return -EIO;
-	}
-
-	pbyte = kmalloc(mem, GFP_KERNEL);
-	if (pbyte == NULL) {
-		dev_err(kbdev->dev, "Couldn't allocate memory for trace buffer dump");
-		return -ENOMEM;
-	}
-
-	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-	n_read = kbase_csf_firmware_trace_buffer_read_data(tb, pbyte, mem);
-	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
-
-	/* Do the copy, if we have obtained some trace data */
-	not_copied = (n_read) ? copy_to_user(buf, pbyte, n_read) : 0;
-	kfree(pbyte);
-
-	if (!not_copied) {
-		*ppos += n_read;
-		return n_read;
-	}
-
-	dev_err(kbdev->dev, "Couldn't copy trace buffer data to user space buffer");
-	return -EFAULT;
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_trace_enable_mask_fops,
-			 kbase_csf_firmware_trace_enable_mask_read,
-			 kbase_csf_firmware_trace_enable_mask_write, "%llx\n");
-
-static const struct file_operations kbasep_csf_firmware_trace_debugfs_fops = {
-	.owner = THIS_MODULE,
-	.open = kbasep_csf_firmware_trace_debugfs_open,
-	.read = kbasep_csf_firmware_trace_debugfs_read,
-	.llseek = no_llseek,
-};
-
-void kbase_csf_firmware_trace_buffer_debugfs_init(struct kbase_device *kbdev)
-{
-	debugfs_create_file("fw_trace_enable_mask", 0644,
-			    kbdev->mali_debugfs_directory, kbdev,
-			    &kbase_csf_firmware_trace_enable_mask_fops);
-
-	debugfs_create_file("fw_traces", 0444,
-			    kbdev->mali_debugfs_directory, kbdev,
-			    &kbasep_csf_firmware_trace_debugfs_fops);
-}
-#endif /* CONFIG_DEBUG_FS */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h
@ -25,7 +25,7 @@
 #include <linux/types.h>

 #define CSF_FIRMWARE_TRACE_ENABLE_INIT_MASK_MAX (4)
-#define FW_TRACE_BUF_NAME "fwlog"
+#define FIRMWARE_LOG_BUF_NAME "fwlog"

 /* Forward declarations */
 struct firmware_trace_buffer;
@ -165,14 +165,23 @@ bool kbase_csf_firmware_trace_buffer_is_empty(
 unsigned int kbase_csf_firmware_trace_buffer_read_data(
 	struct firmware_trace_buffer *trace_buffer, u8 *data, unsigned int num_bytes);

-#if IS_ENABLED(CONFIG_DEBUG_FS)
 /**
- * kbase_csf_firmware_trace_buffer_debugfs_init() - Add debugfs entries for
- * setting enable mask and dumping the binary firmware trace buffer
+ * kbase_csf_firmware_trace_buffer_get_active_mask64 - Get trace buffer active mask
 *
- * @kbdev: Pointer to the device
+ * @tb: Trace buffer handle
+ *
+ * Return: Trace buffer active mask.
 */
-void kbase_csf_firmware_trace_buffer_debugfs_init(struct kbase_device *kbdev);
-#endif /* CONFIG_DEBUG_FS */
+u64 kbase_csf_firmware_trace_buffer_get_active_mask64(struct firmware_trace_buffer *tb);
+
+/**
+ * kbase_csf_firmware_trace_buffer_set_active_mask64 - Set trace buffer active mask
+ *
+ * @tb: Trace buffer handle
+ * @mask: New active mask
+ *
+ * Return: 0 if successful, negative error code on failure.
+ */
+int kbase_csf_firmware_trace_buffer_set_active_mask64(struct firmware_trace_buffer *tb, u64 mask);

 #endif /* _KBASE_CSF_TRACE_BUFFER_H_ */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.c
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.c
@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#include <mali_kbase.h>
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+
+/**
+ * kbasep_fault_occurred - Report whether a fault is currently recorded.
+ *
+ * @kbdev:  Device pointer
+ *
+ * The recorded error code is sampled while holding the dof spinlock.
+ *
+ * Return: true if the recorded error code differs from DF_NO_ERROR.
+ */
+static bool kbasep_fault_occurred(struct kbase_device *kbdev)
+{
+	enum dumpfault_error_type recorded;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&kbdev->csf.dof.lock, irq_flags);
+	recorded = kbdev->csf.dof.error_code;
+	spin_unlock_irqrestore(&kbdev->csf.dof.lock, irq_flags);
+
+	return recorded != DF_NO_ERROR;
+}
+
+/* Block the calling thread until kbase_debug_csf_fault_dump_complete() reports
+ * completion. Returns immediately when dumping is not enabled for the device.
+ */
+void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev)
+{
+	if (unlikely(kbase_debug_csf_fault_dump_enabled(kbdev))) {
+		wait_event(kbdev->csf.dof.dump_wait_wq,
+			   kbase_debug_csf_fault_dump_complete(kbdev));
+		return;
+	}
+
+	dev_dbg(kbdev->dev, "No userspace client for dumping exists");
+}
+KBASE_EXPORT_TEST_API(kbase_debug_csf_fault_wait_completion);
+
+/**
+ * kbase_debug_csf_fault_wakeup - Wake up waiters on the fault wait queue.
+ *
+ * @kbdev:   Kbase device
+ *
+ * Wakes interruptible sleepers queued on the device's fault_wait_wq, which is
+ * where the debugfs read handler waits for a fault to be recorded.
+ */
+static void kbase_debug_csf_fault_wakeup(struct kbase_device *kbdev)
+{
+	wait_queue_head_t *const fault_wq = &kbdev->csf.dof.fault_wait_wq;
+
+	wake_up_interruptible(fault_wq);
+}
+
+/* Record a fault for the dump-on-fault client and wake it up.
+ *
+ * Returns false when dumping is not enabled or @error is DF_NO_ERROR. Returns
+ * true otherwise, including when the new fault is skipped because an earlier
+ * one is still pending.
+ */
+bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev,
+	struct kbase_context *kctx, enum dumpfault_error_type error)
+{
+	unsigned long irq_flags;
+
+	if (likely(!kbase_debug_csf_fault_dump_enabled(kbdev)))
+		return false;
+
+	if (WARN_ON(error == DF_NO_ERROR))
+		return false;
+
+	/* A dying context is not reported; the fault is recorded without one */
+	if (kctx && kbase_ctx_flag(kctx, KCTX_DYING)) {
+		dev_info(kbdev->dev, "kctx %d_%d is dying when error %d is reported",
+			kctx->tgid, kctx->id, error);
+		kctx = NULL;
+	}
+
+	spin_lock_irqsave(&kbdev->csf.dof.lock, irq_flags);
+
+	if (!kbdev->csf.dof.error_code) {
+		kbdev->csf.dof.kctx_tgid = kctx ? kctx->tgid : 0;
+		kbdev->csf.dof.kctx_id = kctx ? kctx->id : 0;
+		kbdev->csf.dof.error_code = error;
+		kbase_debug_csf_fault_wakeup(kbdev);
+	} else {
+		/* Only one fault at a time can be processed */
+		dev_info(kbdev->dev, "skip this fault as there's a pending fault");
+	}
+
+	spin_unlock_irqrestore(&kbdev->csf.dof.lock, irq_flags);
+	return true;
+}
+
+/**
+ * debug_csf_fault_read - debugfs read handler delivering the recorded fault.
+ *
+ * @file:   debugfs file; private_data holds the kbase device.
+ * @buffer: User buffer receiving the text.
+ * @size:   Size of @buffer; must be at least BUF_SIZE bytes.
+ * @f_pos:  File position, must not be negative.
+ *
+ * Sleeps (interruptibly) until a fault has been recorded, then formats it as
+ * "<tgid>_<ctx_id>_<error_code>\n" and copies it to user space.
+ *
+ * Return: Result of simple_read_from_buffer() on success, -EINVAL on a bad
+ *         argument, -ERESTARTSYS if the wait was interrupted.
+ */
+static ssize_t debug_csf_fault_read(struct file *file, char __user *buffer, size_t size,
+				    loff_t *f_pos)
+{
+#define BUF_SIZE 64
+	struct kbase_device *kbdev;
+	unsigned long flags;
+	int count;
+	char buf[BUF_SIZE];
+	u32 tgid, ctx_id;
+	enum dumpfault_error_type error_code;
+
+	if (unlikely(!file)) {
+		pr_warn("%s: file is NULL", __func__);
+		return -EINVAL;
+	}
+
+	kbdev = file->private_data;
+	if (unlikely(!buffer)) {
+		dev_warn(kbdev->dev, "%s: buffer is NULL", __func__);
+		return -EINVAL;
+	}
+
+	if (unlikely(*f_pos < 0)) {
+		dev_warn(kbdev->dev, "%s: f_pos is negative", __func__);
+		return -EINVAL;
+	}
+
+	/* The caller must provide room for the whole formatted record */
+	if (size < sizeof(buf)) {
+		dev_warn(kbdev->dev, "%s: buffer is too small", __func__);
+		return -EINVAL;
+	}
+
+	/* Sleep until a fault has been recorded, or a signal arrives */
+	if (wait_event_interruptible(kbdev->csf.dof.fault_wait_wq, kbasep_fault_occurred(kbdev)))
+		return -ERESTARTSYS;
+
+	/* Snapshot and format the fault details under the lock */
+	spin_lock_irqsave(&kbdev->csf.dof.lock, flags);
+	tgid = kbdev->csf.dof.kctx_tgid;
+	ctx_id = kbdev->csf.dof.kctx_id;
+	error_code = kbdev->csf.dof.error_code;
+	BUILD_BUG_ON(sizeof(buf) < (sizeof(tgid) + sizeof(ctx_id) + sizeof(error_code)));
+	count = scnprintf(buf, sizeof(buf), "%u_%u_%u\n", tgid, ctx_id, error_code);
+	spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags);
+
+	dev_info(kbdev->dev, "debug csf fault info read");
+	return simple_read_from_buffer(buffer, size, f_pos, buf, count);
+}
+
+/* debugfs open handler. Claims the single dump-on-fault client slot by
+ * switching the "enabled" flag from 0 to 1; a second opener gets -EBUSY.
+ */
+static int debug_csf_fault_open(struct inode *in, struct file *file)
+{
+	struct kbase_device *kbdev;
+	int prev_state;
+
+	if (unlikely(!in)) {
+		pr_warn("%s: inode is NULL", __func__);
+		return -EINVAL;
+	}
+
+	kbdev = in->i_private;
+
+	if (unlikely(!file)) {
+		dev_warn(kbdev->dev, "%s: file is NULL", __func__);
+		return -EINVAL;
+	}
+
+	/* Atomically take the client slot; a previous value of 1 means it is taken */
+	prev_state = atomic_cmpxchg(&kbdev->csf.dof.enabled, 0, 1);
+	if (prev_state == 1) {
+		dev_warn(kbdev->dev, "Only one client is allowed for dump on fault");
+		return -EBUSY;
+	}
+
+	dev_info(kbdev->dev, "debug csf fault file open");
+
+	return simple_open(in, file);
+}
+
+/* debugfs write handler. Any write is taken as "dump finished": the recorded
+ * fault is cleared and threads sleeping on dump_wait_wq are woken. The written
+ * data itself is not read; @count is returned unchanged.
+ */
+static ssize_t debug_csf_fault_write(struct file *file, const char __user *ubuf, size_t count,
+				     loff_t *ppos)
+{
+	struct kbase_device *kbdev;
+	unsigned long irq_flags;
+
+	if (unlikely(!file)) {
+		pr_warn("%s: file is NULL", __func__);
+		return -EINVAL;
+	}
+
+	kbdev = file->private_data;
+
+	/* Reset the recorded fault to the "no fault" state */
+	spin_lock_irqsave(&kbdev->csf.dof.lock, irq_flags);
+	kbdev->csf.dof.kctx_id = 0;
+	kbdev->csf.dof.kctx_tgid = 0;
+	kbdev->csf.dof.error_code = DF_NO_ERROR;
+	dev_info(kbdev->dev, "debug csf fault dump complete");
+	spin_unlock_irqrestore(&kbdev->csf.dof.lock, irq_flags);
+
+	/* The dump is done from user space's point of view: let blocked kernel
+	 * threads proceed.
+	 */
+	wake_up(&kbdev->csf.dof.dump_wait_wq);
+
+	return count;
+}
+
+/* debugfs release handler. Clears any recorded fault, frees the single client
+ * slot and wakes threads sleeping on dump_wait_wq.
+ */
+static int debug_csf_fault_release(struct inode *in, struct file *file)
+{
+	struct kbase_device *kbdev;
+	unsigned long irq_flags;
+
+	if (unlikely(!in)) {
+		pr_warn("%s: inode is NULL", __func__);
+		return -EINVAL;
+	}
+
+	kbdev = in->i_private;
+
+	/* Reset the recorded fault to the "no fault" state */
+	spin_lock_irqsave(&kbdev->csf.dof.lock, irq_flags);
+	kbdev->csf.dof.error_code = DF_NO_ERROR;
+	kbdev->csf.dof.kctx_id = 0;
+	kbdev->csf.dof.kctx_tgid = 0;
+	spin_unlock_irqrestore(&kbdev->csf.dof.lock, irq_flags);
+
+	atomic_set(&kbdev->csf.dof.enabled, 0);
+	dev_info(kbdev->dev, "debug csf fault file close");
+
+	/* The client is gone: let blocked kernel threads resume */
+	wake_up(&kbdev->csf.dof.dump_wait_wq);
+
+	return 0;
+}
+
+/* File operations of the "csf_fault" debugfs file: open claims the single
+ * client slot, read waits for and returns a recorded fault, write signals that
+ * the dump is complete, release frees the client slot again.
+ */
+static const struct file_operations kbasep_debug_csf_fault_fops = {
+	.owner = THIS_MODULE,
+	.open = debug_csf_fault_open,
+	.read = debug_csf_fault_read,
+	.write = debug_csf_fault_write,
+	.llseek = default_llseek,
+	.release = debug_csf_fault_release,
+};
+
+/* Create the "csf_fault" debugfs file (mode 0600) in the device's mali debugfs
+ * directory. Does nothing but warn when @kbdev is NULL.
+ */
+void kbase_debug_csf_fault_debugfs_init(struct kbase_device *kbdev)
+{
+	if (likely(kbdev)) {
+		debugfs_create_file("csf_fault", 0600, kbdev->mali_debugfs_directory, kbdev,
+				    &kbasep_debug_csf_fault_fops);
+		return;
+	}
+
+	pr_warn("%s: kbdev is NULL", __func__);
+}
+
+/* Initialise the per-device dump-on-fault state: lock, both wait queues, a
+ * cleared fault record and a free client slot. Returns 0, or -EINVAL when
+ * @kbdev is NULL.
+ */
+int kbase_debug_csf_fault_init(struct kbase_device *kbdev)
+{
+	if (unlikely(!kbdev)) {
+		pr_warn("%s: kbdev is NULL", __func__);
+		return -EINVAL;
+	}
+
+	spin_lock_init(&kbdev->csf.dof.lock);
+	init_waitqueue_head(&kbdev->csf.dof.fault_wait_wq);
+	init_waitqueue_head(&kbdev->csf.dof.dump_wait_wq);
+
+	/* Start with no fault recorded and no client attached */
+	kbdev->csf.dof.error_code = DF_NO_ERROR;
+	kbdev->csf.dof.kctx_tgid = 0;
+	kbdev->csf.dof.kctx_id = 0;
+	atomic_set(&kbdev->csf.dof.enabled, 0);
+
+	return 0;
+}
+
+/* Counterpart of kbase_debug_csf_fault_init(). Intentionally empty: this
+ * function performs no teardown.
+ */
+void kbase_debug_csf_fault_term(struct kbase_device *kbdev)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
--- a/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.h
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.h
@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _KBASE_DEBUG_CSF_FAULT_H
+#define _KBASE_DEBUG_CSF_FAULT_H
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+/**
+ * kbase_debug_csf_fault_debugfs_init - Initialize CSF fault debugfs
+ * @kbdev:	Device pointer
+ */
+void kbase_debug_csf_fault_debugfs_init(struct kbase_device *kbdev);
+
+/**
+ * kbase_debug_csf_fault_init - Create the fault event wait queue per device
+ *                              and initialize the required resources.
+ * @kbdev:    Device pointer
+ *
+ * Return: Zero on success or a negative error code.
+ */
+int kbase_debug_csf_fault_init(struct kbase_device *kbdev);
+
+/**
+ * kbase_debug_csf_fault_term - Clean up resources created by
+ *		                kbase_debug_csf_fault_init().
+ * @kbdev:    Device pointer
+ */
+void kbase_debug_csf_fault_term(struct kbase_device *kbdev);
+
+/**
+ * kbase_debug_csf_fault_wait_completion - Wait for the client to complete.
+ *
+ * @kbdev:    Device Pointer
+ *
+ * Wait for the user space client to finish reading the fault information.
+ * This function must be called in thread context.
+ */
+void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev);
+
+/**
+ * kbase_debug_csf_fault_notify - Notify client of a fault.
+ *
+ * @kbdev:    Device pointer
+ * @kctx:     Faulty context (can be NULL)
+ * @error:    Error code.
+ *
+ * Store fault information and wake up the user space client.
+ *
+ * Return: true if a dump on fault was initiated or is already in progress,
+ *         so the caller can opt to wait for the dumping to complete.
+ */
+bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev,
+		struct kbase_context *kctx, enum dumpfault_error_type error);
+
+/**
+ * kbase_debug_csf_fault_dump_enabled - Query the dump-on-fault enable flag.
+ *
+ * @kbdev:  Device pointer
+ *
+ * Return: true if the debugfs file is opened, i.e. dump on fault is enabled.
+ */
+static inline bool kbase_debug_csf_fault_dump_enabled(struct kbase_device *kbdev)
+{
+	return atomic_read(&kbdev->csf.dof.enabled) != 0;
+}
+
+/**
+ * kbase_debug_csf_fault_dump_complete - Report whether no fault dump is pending.
+ *
+ * @kbdev:  Device pointer
+ *
+ * Return: true if dump on fault is disabled or no fault code is recorded.
+ */
+static inline bool kbase_debug_csf_fault_dump_complete(struct kbase_device *kbdev)
+{
+	bool complete = true;
+	unsigned long irq_state;
+
+	if (unlikely(kbase_debug_csf_fault_dump_enabled(kbdev))) {
+		/* The recorded error code is only read under the dof lock. */
+		spin_lock_irqsave(&kbdev->csf.dof.lock, irq_state);
+		complete = (kbdev->csf.dof.error_code == DF_NO_ERROR);
+		spin_unlock_irqrestore(&kbdev->csf.dof.lock, irq_state);
+	}
+
+	return complete;
+}
+#else /* CONFIG_DEBUG_FS */
+static inline int kbase_debug_csf_fault_init(struct kbase_device *kbdev)
+{
+	return 0; /* nothing to set up when debugfs is compiled out */
+}
+
+static inline void kbase_debug_csf_fault_term(struct kbase_device *kbdev)
+{
+}
+
+static inline void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev)
+{
+}
+
+static inline bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev,
+		struct kbase_context *kctx, enum dumpfault_error_type error)
+{
+	return false; /* no dump on fault is initiated in this configuration */
+}
+
+static inline bool kbase_debug_csf_fault_dump_enabled(struct kbase_device *kbdev)
+{
+	return false; /* there is no debugfs file that could be opened */
+}
+
+static inline bool kbase_debug_csf_fault_dump_complete(struct kbase_device *kbdev)
+{
+	return true; /* same answer as the DEBUG_FS variant when dump on fault is disabled */
+}
+#endif /* CONFIG_DEBUG_FS */
+
+#endif /*_KBASE_DEBUG_CSF_FAULT_H*/
--- a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h
+++ b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h
@ -42,19 +42,25 @@ int dummy_array[] = {
 	/*
 	 * Generic CSF events
 	 */
+	/* info_val = 0 */
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_EVICT_CTX_SLOTS_START),
+	/* info_val == number of CSGs supported */
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_EVICT_CTX_SLOTS_END),
 	/* info_val[0:7]   == fw version_minor
 	 * info_val[15:8]  == fw version_major
 	 * info_val[63:32] == fw version_hash
 	 */
 	KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_BOOT),
 	KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_REBOOT),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_INVOKE),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_INVOKE),
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_START),
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_END),
 	/* info_val == total number of runnable groups across all kctxs */
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_START),
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_END),
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_RESET_START),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_RESET_END),
 	/* info_val = timeout in ms */
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_PROTM_WAIT_QUIT_START),
 	/* info_val = remaining ms timeout, or 0 if timedout */
@ -101,6 +107,8 @@ int dummy_array[] = {
 	 * purpose.
 	 */
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_GPU_IDLE_WORKER_HANDLING_START),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_GPU_IDLE_WORKER_HANDLING_END),
+
 	KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_MCU_HALTED),
 	KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_MCU_SLEEP),

@ -126,6 +134,8 @@ int dummy_array[] = {
 	 * group->csg_nr indicates which bit was set
 	 */
 	KBASE_KTRACE_CODE_MAKE_CODE(CSG_SLOT_IDLE_SET),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSG_INTERRUPT_NO_NON_IDLE_GROUPS),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSG_INTERRUPT_NON_IDLE_GROUPS),
 	/* info_val = scheduler's new csg_slots_idle_mask[0]
 	 * group->csg_nr indicates which bit was cleared
 	 *
@ -190,10 +200,37 @@ int dummy_array[] = {
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_NONIDLE_OFFSLOT_GRP_INC),
 	/* info_val == new count of off-slot non-idle groups */
 	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_NONIDLE_OFFSLOT_GRP_DEC),
+	/* info_val = scheduler's new csg_slots_idle_mask[0]
+	 * group->csg_nr indicates which bit was set
+	 */
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_HANDLE_IDLE_SLOTS),

 	KBASE_KTRACE_CODE_MAKE_CODE(PROTM_EVENT_WORKER_START),
 	KBASE_KTRACE_CODE_MAKE_CODE(PROTM_EVENT_WORKER_END),

+	/* info_val = scheduler state */
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHED_BUSY),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHED_INACTIVE),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHED_SUSPENDED),
+	KBASE_KTRACE_CODE_MAKE_CODE(SCHED_SLEEPING),
+
+	/* info_val = mcu state */
+#define KBASEP_MCU_STATE(n) KBASE_KTRACE_CODE_MAKE_CODE(PM_MCU_ ## n),
+#include "backend/gpu/mali_kbase_pm_mcu_states.h"
+#undef KBASEP_MCU_STATE
+
+	/* info_val = number of runnable groups */
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_INACTIVE),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_RUNNABLE),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_IDLE),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED_ON_IDLE),
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED_ON_WAIT_SYNC),
+	/* info_val = new run state of the evicted group */
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_FAULT_EVICTED),
+	/* info_val = number of active CSGs */
+	KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_TERMINATED),
+
 	/*
 	 * Group + Queue events
 	 */
--- a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h
+++ b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h
@ -31,13 +31,17 @@
 * Generic CSF events - using the common DEFINE_MALI_ADD_EVENT
 */
 DEFINE_MALI_ADD_EVENT(SCHEDULER_EVICT_CTX_SLOTS_START);
+DEFINE_MALI_ADD_EVENT(SCHEDULER_EVICT_CTX_SLOTS_END);
 DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_BOOT);
 DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_REBOOT);
+DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_INVOKE);
+DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_INVOKE);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_START);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_END);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_START);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_END);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_RESET_START);
+DEFINE_MALI_ADD_EVENT(SCHEDULER_RESET_END);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_PROTM_WAIT_QUIT_START);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_PROTM_WAIT_QUIT_END);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_EVENT);
@ -58,8 +62,16 @@ DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_WORKER_START);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_WORKER_END);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_UPDATE_IDLE_SLOTS_ACK);
 DEFINE_MALI_ADD_EVENT(SCHEDULER_GPU_IDLE_WORKER_HANDLING_START);
+DEFINE_MALI_ADD_EVENT(SCHEDULER_GPU_IDLE_WORKER_HANDLING_END);
 DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_MCU_HALTED);
 DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_MCU_SLEEP);
+DEFINE_MALI_ADD_EVENT(SCHED_BUSY);
+DEFINE_MALI_ADD_EVENT(SCHED_INACTIVE);
+DEFINE_MALI_ADD_EVENT(SCHED_SUSPENDED);
+DEFINE_MALI_ADD_EVENT(SCHED_SLEEPING);
+#define KBASEP_MCU_STATE(n) DEFINE_MALI_ADD_EVENT(PM_MCU_ ## n);
+#include "backend/gpu/mali_kbase_pm_mcu_states.h"
+#undef KBASEP_MCU_STATE

 DECLARE_EVENT_CLASS(mali_csf_grp_q_template,
 	TP_PROTO(struct kbase_device *kbdev, struct kbase_queue_group *group,
@ -136,6 +148,8 @@ DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_STOPPED);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_CLEANED);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_UPDATE_IDLE_SLOT_REQ);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_IDLE_SET);
+DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_NO_NON_IDLE_GROUPS);
+DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_NON_IDLE_GROUPS);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_IDLE_CLEAR);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_PRIO_UPDATE);
 DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_SYNC_UPDATE);
@ -160,8 +174,17 @@ DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_PROTM_EXIT);
 DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_TOP_GRP);
 DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_NONIDLE_OFFSLOT_GRP_INC);
 DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_NONIDLE_OFFSLOT_GRP_DEC);
+DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_HANDLE_IDLE_SLOTS);
 DEFINE_MALI_CSF_GRP_EVENT(PROTM_EVENT_WORKER_START);
 DEFINE_MALI_CSF_GRP_EVENT(PROTM_EVENT_WORKER_END);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_INACTIVE);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_RUNNABLE);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_IDLE);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED_ON_IDLE);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED_ON_WAIT_SYNC);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_FAULT_EVICTED);
+DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_TERMINATED);

 #undef DEFINE_MALI_CSF_GRP_EVENT

--- a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h
+++ b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2011-2015, 2018-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2011-2015, 2018-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -142,6 +142,11 @@ int dummy_array[] = {
 	KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_SUSPEND_CALLBACK),
 	KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_RESUME_CALLBACK),

+	/* info_val = l2 state */
+#define KBASEP_L2_STATE(n) KBASE_KTRACE_CODE_MAKE_CODE(PM_L2_ ## n),
+#include "backend/gpu/mali_kbase_pm_l2_states.h"
+#undef KBASEP_L2_STATE
+
 	/*
 	 * Context Scheduler events
 	 */
--- a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h
+++ b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2014, 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014, 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -98,6 +98,9 @@ DEFINE_MALI_ADD_EVENT(PM_WAKE_WAITERS);
 DEFINE_MALI_ADD_EVENT(PM_POWEROFF_WAIT_WQ);
 DEFINE_MALI_ADD_EVENT(PM_RUNTIME_SUSPEND_CALLBACK);
 DEFINE_MALI_ADD_EVENT(PM_RUNTIME_RESUME_CALLBACK);
+#define KBASEP_L2_STATE(n) DEFINE_MALI_ADD_EVENT(PM_L2_ ## n);
+#include "backend/gpu/mali_kbase_pm_l2_states.h"
+#undef KBASEP_L2_STATE
 DEFINE_MALI_ADD_EVENT(SCHED_RETAIN_CTX_NOLOCK);
 DEFINE_MALI_ADD_EVENT(SCHED_RELEASE_CTX);
 #ifdef CONFIG_MALI_ARBITER_SUPPORT
--- a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c
+++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c
@ -23,8 +23,8 @@
 #include <device/mali_kbase_device.h>

 #include <mali_kbase_hwaccess_backend.h>
-#include <mali_kbase_hwcnt_backend_csf_if_fw.h>
-#include <mali_kbase_hwcnt_watchdog_if_timer.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h>
+#include <hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h>
 #include <mali_kbase_ctx_sched.h>
 #include <mali_kbase_reset_gpu.h>
 #include <csf/mali_kbase_csf.h>
@ -40,9 +40,10 @@
 #include <backend/gpu/mali_kbase_js_internal.h>
 #include <backend/gpu/mali_kbase_clk_rate_trace_mgr.h>
 #include <csf/mali_kbase_csf_csg_debugfs.h>
-#include <mali_kbase_hwcnt_virtualizer.h>
+#include <hwcnt/mali_kbase_hwcnt_virtualizer.h>
 #include <mali_kbase_kinstr_prfcnt.h>
 #include <mali_kbase_vinstr.h>
+#include <tl/mali_kbase_timeline.h>

 /**
 * kbase_device_firmware_hwcnt_term - Terminate CSF firmware and HWC
@ -60,7 +61,7 @@ static void kbase_device_firmware_hwcnt_term(struct kbase_device *kbdev)
 		kbase_vinstr_term(kbdev->vinstr_ctx);
 		kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt);
 		kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface);
-		kbase_csf_firmware_term(kbdev);
+		kbase_csf_firmware_unload_term(kbdev);
 	}
 }

@ -197,6 +198,20 @@ static int kbase_csf_early_init(struct kbase_device *kbdev)
 static void kbase_csf_early_term(struct kbase_device *kbdev)
 {
 	kbase_csf_scheduler_early_term(kbdev);
+	kbase_csf_firmware_early_term(kbdev);
+}
+
+/**
+ * kbase_csf_late_init - Late CSF initialization step used by dev_init[].
+ * @kbdev:	Device pointer
+ *
+ * Delegates to kbase_csf_firmware_late_init() and passes its status through.
+ *
+ * Return: 0 on success, error code otherwise.
+ */
+static int kbase_csf_late_init(struct kbase_device *kbdev)
+{
+	return kbase_csf_firmware_late_init(kbdev);
 }

 /**
@ -269,59 +284,48 @@ static void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev)

 static const struct kbase_device_init dev_init[] = {
 #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
-	{ kbase_gpu_device_create, kbase_gpu_device_destroy,
-	  "Dummy model initialization failed" },
+	{ kbase_gpu_device_create, kbase_gpu_device_destroy, "Dummy model initialization failed" },
 #else
 	{ assign_irqs, NULL, "IRQ search failed" },
 	{ registers_map, registers_unmap, "Register map failed" },
 #endif
-	{ power_control_init, power_control_term,
-	  "Power control initialization failed" },
+	{ power_control_init, power_control_term, "Power control initialization failed" },
 	{ kbase_device_io_history_init, kbase_device_io_history_term,
 	  "Register access history initialization failed" },
-	{ kbase_device_early_init, kbase_device_early_term,
-	  "Early device initialization failed" },
-	{ kbase_device_populate_max_freq, NULL,
-	  "Populating max frequency failed" },
-	{ kbase_pm_lowest_gpu_freq_init, NULL,
-	  "Lowest freq initialization failed" },
+	{ kbase_device_early_init, kbase_device_early_term, "Early device initialization failed" },
+	{ kbase_device_populate_max_freq, NULL, "Populating max frequency failed" },
+	{ kbase_pm_lowest_gpu_freq_init, NULL, "Lowest freq initialization failed" },
 	{ kbase_device_misc_init, kbase_device_misc_term,
 	  "Miscellaneous device initialization failed" },
 	{ kbase_device_pcm_dev_init, kbase_device_pcm_dev_term,
 	  "Priority control manager initialization failed" },
-	{ kbase_ctx_sched_init, kbase_ctx_sched_term,
-	  "Context scheduler initialization failed" },
-	{ kbase_mem_init, kbase_mem_term,
-	  "Memory subsystem initialization failed" },
+	{ kbase_ctx_sched_init, kbase_ctx_sched_term, "Context scheduler initialization failed" },
+	{ kbase_mem_init, kbase_mem_term, "Memory subsystem initialization failed" },
 	{ kbase_csf_protected_memory_init, kbase_csf_protected_memory_term,
 	  "Protected memory allocator initialization failed" },
 	{ kbase_device_coherency_init, NULL, "Device coherency init failed" },
 	{ kbase_protected_mode_init, kbase_protected_mode_term,
 	  "Protected mode subsystem initialization failed" },
-	{ kbase_device_list_init, kbase_device_list_term,
-	  "Device list setup failed" },
+	{ kbase_device_list_init, kbase_device_list_term, "Device list setup failed" },
 	{ kbase_device_timeline_init, kbase_device_timeline_term,
 	  "Timeline stream initialization failed" },
 	{ kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term,
 	  "Clock rate trace manager initialization failed" },
-	{ kbase_device_hwcnt_watchdog_if_init,
-	  kbase_device_hwcnt_watchdog_if_term,
+	{ kbase_device_hwcnt_watchdog_if_init, kbase_device_hwcnt_watchdog_if_term,
 	  "GPU hwcnt backend watchdog interface creation failed" },
-	{ kbase_device_hwcnt_backend_csf_if_init,
-	  kbase_device_hwcnt_backend_csf_if_term,
+	{ kbase_device_hwcnt_backend_csf_if_init, kbase_device_hwcnt_backend_csf_if_term,
 	  "GPU hwcnt backend CSF interface creation failed" },
-	{ kbase_device_hwcnt_backend_csf_init,
-	  kbase_device_hwcnt_backend_csf_term,
+	{ kbase_device_hwcnt_backend_csf_init, kbase_device_hwcnt_backend_csf_term,
 	  "GPU hwcnt backend creation failed" },
 	{ kbase_device_hwcnt_context_init, kbase_device_hwcnt_context_term,
 	  "GPU hwcnt context initialization failed" },
-	{ kbase_csf_early_init, kbase_csf_early_term,
-	  "Early CSF initialization failed" },
-	{ kbase_backend_late_init, kbase_backend_late_term,
-	  "Late backend initialization failed" },
+	{ kbase_csf_early_init, kbase_csf_early_term, "Early CSF initialization failed" },
+	{ kbase_backend_late_init, kbase_backend_late_term, "Late backend initialization failed" },
+	{ kbase_csf_late_init, NULL, "Late CSF initialization failed" },
 	{ NULL, kbase_device_firmware_hwcnt_term, NULL },
-	{ kbase_device_debugfs_init, kbase_device_debugfs_term,
-	  "DebugFS initialization failed" },
+	{ kbase_debug_csf_fault_init, kbase_debug_csf_fault_term,
+	  "CSF fault debug initialization failed" },
+	{ kbase_device_debugfs_init, kbase_device_debugfs_term, "DebugFS initialization failed" },
 	/* Sysfs init needs to happen before registering the device with
 	 * misc_register(), otherwise it causes a race condition between
 	 * registering the device and a uevent event being generated for
@ -339,8 +343,7 @@ static const struct kbase_device_init dev_init[] = {
 	  "Misc device registration failed" },
 	{ kbase_gpuprops_populate_user_buffer, kbase_gpuprops_free_user_buffer,
 	  "GPU property population failed" },
-	{ kbase_device_late_init, kbase_device_late_term,
-	  "Late device initialization failed" },
+	{ kbase_device_late_init, kbase_device_late_term, "Late device initialization failed" },
 };

 static void kbase_device_term_partial(struct kbase_device *kbdev,
@ -468,7 +471,7 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev)

 	lockdep_assert_held(&kbdev->fw_load_lock);

-	err = kbase_csf_firmware_init(kbdev);
+	err = kbase_csf_firmware_load_init(kbdev);
 	if (!err) {
 		unsigned long flags;

@ -498,11 +501,12 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev)

 		ret = kbase_device_hwcnt_csf_deferred_init(kbdev);
 		if (ret) {
-			kbase_csf_firmware_term(kbdev);
+			kbase_csf_firmware_unload_term(kbdev);
 			goto out;
 		}

 		kbase_csf_debugfs_init(kbdev);
+		kbase_timeline_io_debugfs_init(kbdev);
 out:
 		kbase_pm_context_idle(kbdev);
 	}
--- a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c
+++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c
@ -115,6 +115,9 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val)
 									GPU_EXCEPTION_TYPE_SW_FAULT_0,
 							} } };

+			kbase_debug_csf_fault_notify(kbdev, scheduler->active_protm_grp->kctx,
+						     DF_GPU_PROTECTED_FAULT);
+
 			scheduler->active_protm_grp->faulted = true;
 			kbase_csf_add_group_fatal_error(
 				scheduler->active_protm_grp, &err_payload);
@ -201,8 +204,11 @@ static bool kbase_is_register_accessible(u32 offset)

 void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value)
 {
-	KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered);
-	KBASE_DEBUG_ASSERT(kbdev->dev != NULL);
+	if (WARN_ON(!kbdev->pm.backend.gpu_powered))
+		return;
+
+	if (WARN_ON(kbdev->dev == NULL))
+		return;

 	if (!kbase_is_register_accessible(offset))
 		return;
@ -222,8 +228,11 @@ u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset)
 {
 	u32 val;

-	KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered);
-	KBASE_DEBUG_ASSERT(kbdev->dev != NULL);
+	if (WARN_ON(!kbdev->pm.backend.gpu_powered))
+		return 0;
+
+	if (WARN_ON(kbdev->dev == NULL))
+		return 0;

 	if (!kbase_is_register_accessible(offset))
 		return 0;
--- a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c
+++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c
@ -27,9 +27,9 @@
 #include <mali_kbase_hwaccess_backend.h>
 #include <mali_kbase_ctx_sched.h>
 #include <mali_kbase_reset_gpu.h>
-#include <mali_kbase_hwcnt_watchdog_if_timer.h>
-#include <mali_kbase_hwcnt_backend_jm.h>
-#include <mali_kbase_hwcnt_backend_jm_watchdog.h>
+#include <hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend_jm.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h>

 #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
 #include <backend/gpu/mali_kbase_model_linux.h>
--- a/drivers/gpu/arm/bifrost/device/mali_kbase_device.c
+++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device.c
@ -42,8 +42,8 @@
 #include <tl/mali_kbase_timeline.h>
 #include "mali_kbase_kinstr_prfcnt.h"
 #include "mali_kbase_vinstr.h"
-#include "mali_kbase_hwcnt_context.h"
-#include "mali_kbase_hwcnt_virtualizer.h"
+#include "hwcnt/mali_kbase_hwcnt_context.h"
+#include "hwcnt/mali_kbase_hwcnt_virtualizer.h"

 #include "mali_kbase_device.h"
 #include "mali_kbase_device_internal.h"
@ -56,17 +56,15 @@
 #include "arbiter/mali_kbase_arbiter_pm.h"
 #endif /* CONFIG_MALI_ARBITER_SUPPORT */

-/* NOTE: Magic - 0x45435254 (TRCE in ASCII).
- * Supports tracing feature provided in the base module.
- * Please keep it in sync with the value of base module.
- */
-#define TRACE_BUFFER_HEADER_SPECIAL 0x45435254
+#if defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)

 /* Number of register accesses for the buffer that we allocate during
 * initialization time. The buffer size can be changed later via debugfs.
 */
 #define KBASEP_DEFAULT_REGISTER_HISTORY_SIZE ((u16)512)

+#endif /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) */
+
 static DEFINE_MUTEX(kbase_dev_list_lock);
 static LIST_HEAD(kbase_dev_list);
 static int kbase_dev_nr;
--- a/drivers/gpu/arm/bifrost/device/mali_kbase_device.h
+++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device.h
@ -130,7 +130,11 @@ bool kbase_is_gpu_removed(struct kbase_device *kbdev);
 *
 * Return: 0 if successful or a negative error code on failure.
 */
-#define kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op) (0)
+#if MALI_USE_CSF
+int kbase_gpu_cache_flush_pa_range_and_busy_wait(struct kbase_device *kbdev, phys_addr_t phys,
+						 size_t nr_bytes, u32 flush_op);
+#endif /* MALI_USE_CSF */
+
 /**
 * kbase_gpu_cache_flush_and_busy_wait - Start a cache flush and busy wait
 * @kbdev: Kbase device
--- a/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c
+++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c
@ -27,9 +27,6 @@
 #include <mali_kbase_reset_gpu.h>
 #include <mmu/mali_kbase_mmu.h>

-#define U64_LO_MASK ((1ULL << 32) - 1)
-#define U64_HI_MASK (~U64_LO_MASK)
-
 #if !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
 bool kbase_is_gpu_removed(struct kbase_device *kbdev)
 {
@ -86,7 +83,38 @@ static int busy_wait_on_irq(struct kbase_device *kbdev, u32 irq_bit)
 	return 0;
 }

-#define kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op) (0)
+#if MALI_USE_CSF
+#define U64_LO_MASK ((1ULL << 32) - 1)
+#define U64_HI_MASK (~U64_LO_MASK)
+
+int kbase_gpu_cache_flush_pa_range_and_busy_wait(struct kbase_device *kbdev, phys_addr_t phys,
+						 size_t nr_bytes, u32 flush_op)
+{
+	/* Inclusive physical address range [first_pa, last_pa] to flush. */
+	const u64 first_pa = phys;
+	const u64 last_pa = first_pa + nr_bytes - 1;
+
+	lockdep_assert_held(&kbdev->hwaccess_lock);
+
+	/* Clear the FLUSH_PA_RANGE_COMPLETED interrupt bit before issuing the flush. */
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), FLUSH_PA_RANGE_COMPLETED);
+
+	/* Program the command arguments: ARG0 takes the start address and ARG1
+	 * the end address, each written as a low and a high 32-bit word.
+	 */
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG0_LO), first_pa & U64_LO_MASK);
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG0_HI),
+			(first_pa & U64_HI_MASK) >> 32);
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG1_LO), last_pa & U64_LO_MASK);
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG1_HI), (last_pa & U64_HI_MASK) >> 32);
+
+	/* Issue the flush operation itself. */
+	kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), flush_op);
+
+	/* Busy-wait for FLUSH_PA_RANGE_COMPLETED and return the wait's status. */
+	return busy_wait_on_irq(kbdev, (u32)FLUSH_PA_RANGE_COMPLETED);
+}
+#endif /* MALI_USE_CSF */

 int kbase_gpu_cache_flush_and_busy_wait(struct kbase_device *kbdev,
 					u32 flush_op)
--- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c
+++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -170,7 +170,7 @@ const char *kbase_gpu_exception_name(u32 const exception_code)
 	default:
 		e = "UNKNOWN";
 		break;
-	};
+	}

 	return e;
 }
--- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h
+++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h
@ -35,10 +35,7 @@
 #define MCU_SUBSYSTEM_BASE 0x20000

 /* IPA control registers */
-#define IPA_CONTROL_BASE       0x40000
-#define IPA_CONTROL_REG(r)     (IPA_CONTROL_BASE+(r))
 #define COMMAND                0x000 /* (WO) Command register */
-#define STATUS                 0x004 /* (RO) Status register */
 #define TIMER                  0x008 /* (RW) Timer control register */

 #define SELECT_CSHW_LO         0x010 /* (RW) Counter select for CS hardware, low word */
@ -127,8 +124,16 @@

 #define MCU_STATUS_HALTED        (1 << 1)

+#define L2_CONFIG_PBHA_HWU_SHIFT GPU_U(12)
+#define L2_CONFIG_PBHA_HWU_MASK (GPU_U(0xF) << L2_CONFIG_PBHA_HWU_SHIFT)
+#define L2_CONFIG_PBHA_HWU_GET(reg_val)                                                            \
+	(((reg_val)&L2_CONFIG_PBHA_HWU_MASK) >> L2_CONFIG_PBHA_HWU_SHIFT)
+#define L2_CONFIG_PBHA_HWU_SET(reg_val, value)                                                     \
+	(((reg_val) & ~L2_CONFIG_PBHA_HWU_MASK) |                                                  \
+	 (((value) << L2_CONFIG_PBHA_HWU_SHIFT) & L2_CONFIG_PBHA_HWU_MASK))
+
 /* JOB IRQ flags */
-#define JOB_IRQ_GLOBAL_IF       (1 << 31)   /* Global interface interrupt received */
+#define JOB_IRQ_GLOBAL_IF (1u << 31) /* Global interface interrupt received */

 /* GPU_COMMAND codes */
 #define GPU_COMMAND_CODE_NOP                0x00 /* No operation, nothing happens */
--- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h
+++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h
@ -127,29 +127,12 @@

 #define JOB_SLOT_REG(n, r)      (JOB_CONTROL_REG(JOB_SLOT0 + ((n) << 7)) + (r))

-#define JS_HEAD_LO             0x00	/* (RO) Job queue head pointer for job slot n, low word */
-#define JS_HEAD_HI             0x04	/* (RO) Job queue head pointer for job slot n, high word */
-#define JS_TAIL_LO             0x08	/* (RO) Job queue tail pointer for job slot n, low word */
-#define JS_TAIL_HI             0x0C	/* (RO) Job queue tail pointer for job slot n, high word */
-#define JS_AFFINITY_LO         0x10	/* (RO) Core affinity mask for job slot n, low word */
-#define JS_AFFINITY_HI         0x14	/* (RO) Core affinity mask for job slot n, high word */
-#define JS_CONFIG              0x18	/* (RO) Configuration settings for job slot n */
-/* (RO) Extended affinity mask for job slot n*/
-#define JS_XAFFINITY           0x1C
+#define JS_XAFFINITY           0x1C /* (RO) Extended affinity mask for job slot n*/

 #define JS_COMMAND             0x20	/* (WO) Command register for job slot n */
 #define JS_STATUS              0x24	/* (RO) Status register for job slot n */

-#define JS_HEAD_NEXT_LO        0x40	/* (RW) Next job queue head pointer for job slot n, low word */
-#define JS_HEAD_NEXT_HI        0x44	/* (RW) Next job queue head pointer for job slot n, high word */
-
-#define JS_AFFINITY_NEXT_LO    0x50	/* (RW) Next core affinity mask for job slot n, low word */
-#define JS_AFFINITY_NEXT_HI    0x54	/* (RW) Next core affinity mask for job slot n, high word */
-#define JS_CONFIG_NEXT         0x58	/* (RW) Next configuration settings for job slot n */
-/* (RW) Next extended affinity mask for job slot n */
-#define JS_XAFFINITY_NEXT      0x5C
-
-#define JS_COMMAND_NEXT        0x60	/* (RW) Next command register for job slot n */
+#define JS_XAFFINITY_NEXT      0x5C /* (RW) Next extended affinity mask for job slot n */

 #define JS_FLUSH_ID_NEXT       0x70	/* (RW) Next job slot n cache flush ID */

--- a/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h
+++ b/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h
@ -45,9 +45,6 @@
 /* Begin Register Offsets */
 /* GPU control registers */

-#define GPU_CONTROL_BASE        0x0000
-#define GPU_CONTROL_REG(r)      (GPU_CONTROL_BASE + (r))
-#define GPU_ID                  0x000   /* (RO) GPU and revision identifier */
 #define L2_FEATURES             0x004   /* (RO) Level 2 cache features */
 #define TILER_FEATURES          0x00C   /* (RO) Tiler Features */
 #define MEM_FEATURES            0x010   /* (RO) Memory system features */
@ -100,6 +97,10 @@

 #define TEXTURE_FEATURES_REG(n) GPU_CONTROL_REG(TEXTURE_FEATURES_0 + ((n) << 2))

+#define GPU_COMMAND_ARG0_LO 0x0D0 /* (RW) Additional parameter 0 for GPU commands, low word */
+#define GPU_COMMAND_ARG0_HI 0x0D4 /* (RW) Additional parameter 0 for GPU commands, high word */
+#define GPU_COMMAND_ARG1_LO 0x0D8 /* (RW) Additional parameter 1 for GPU commands, low word */
+#define GPU_COMMAND_ARG1_HI 0x0DC /* (RW) Additional parameter 1 for GPU commands, high word */

 #define SHADER_PRESENT_LO       0x100   /* (RO) Shader core present bitmap, low word */
 #define SHADER_PRESENT_HI       0x104   /* (RO) Shader core present bitmap, high word */
@ -113,26 +114,10 @@
 #define STACK_PRESENT_LO        0xE00   /* (RO) Core stack present bitmap, low word */
 #define STACK_PRESENT_HI        0xE04   /* (RO) Core stack present bitmap, high word */

-#define SHADER_READY_LO         0x140   /* (RO) Shader core ready bitmap, low word */
-#define SHADER_READY_HI         0x144   /* (RO) Shader core ready bitmap, high word */
-
-#define TILER_READY_LO          0x150   /* (RO) Tiler core ready bitmap, low word */
-#define TILER_READY_HI          0x154   /* (RO) Tiler core ready bitmap, high word */
-
-#define L2_READY_LO             0x160   /* (RO) Level 2 cache ready bitmap, low word */
-#define L2_READY_HI             0x164   /* (RO) Level 2 cache ready bitmap, high word */
-
 #define STACK_READY_LO          0xE10   /* (RO) Core stack ready bitmap, low word */
 #define STACK_READY_HI          0xE14   /* (RO) Core stack ready bitmap, high word */

-#define SHADER_PWRON_LO         0x180   /* (WO) Shader core power on bitmap, low word */
-#define SHADER_PWRON_HI         0x184   /* (WO) Shader core power on bitmap, high word */
-
-#define TILER_PWRON_LO          0x190   /* (WO) Tiler core power on bitmap, low word */
-#define TILER_PWRON_HI          0x194   /* (WO) Tiler core power on bitmap, high word */
-
-#define L2_PWRON_LO             0x1A0   /* (WO) Level 2 cache power on bitmap, low word */
-#define L2_PWRON_HI             0x1A4   /* (WO) Level 2 cache power on bitmap, high word */
+#define SHADER_PWRFEATURES      0x188   /* (RW) Shader core power features */

 #define STACK_PWRON_LO          0xE20   /* (RO) Core stack power on bitmap, low word */
 #define STACK_PWRON_HI          0xE24   /* (RO) Core stack power on bitmap, high word */
@ -181,6 +166,8 @@
 #define COHERENCY_FEATURES      0x300   /* (RO) Coherency features present */
 #define COHERENCY_ENABLE        0x304   /* (RW) Coherency enable */

+#define AMBA_FEATURES           0x300   /* (RO) AMBA bus supported features */
+#define AMBA_ENABLE             0x304   /* (RW) AMBA features enable */

 #define SHADER_CONFIG           0xF04   /* (RW) Shader core configuration (implementation-specific) */
 #define TILER_CONFIG            0xF08   /* (RW) Tiler core configuration (implementation-specific) */
@ -188,13 +175,7 @@

 /* Job control registers */

-#define JOB_CONTROL_BASE        0x1000
-
-#define JOB_CONTROL_REG(r)      (JOB_CONTROL_BASE + (r))
-
 #define JOB_IRQ_RAWSTAT         0x000   /* Raw interrupt status register */
-#define JOB_IRQ_CLEAR           0x004   /* Interrupt clear register */
-#define JOB_IRQ_MASK            0x008   /* Interrupt mask register */
 #define JOB_IRQ_STATUS          0x00C   /* Interrupt status register */

 /* MMU control registers */
@ -203,7 +184,6 @@
 #define MMU_IRQ_MASK            0x008   /* (RW) Interrupt mask register */
 #define MMU_IRQ_STATUS          0x00C   /* (RO) Interrupt status register */

-#define MMU_AS0                 0x400   /* Configuration registers for address space 0 */
 #define MMU_AS1                 0x440   /* Configuration registers for address space 1 */
 #define MMU_AS2                 0x480   /* Configuration registers for address space 2 */
 #define MMU_AS3                 0x4C0   /* Configuration registers for address space 3 */
@ -221,25 +201,13 @@
 #define MMU_AS15                0x7C0   /* Configuration registers for address space 15 */

 /* MMU address space control registers */
-
-#define MMU_AS_REG(n, r)        (MMU_REG(MMU_AS0 + ((n) << 6)) + (r))
-
-#define AS_TRANSTAB_LO         0x00	/* (RW) Translation Table Base Address for address space n, low word */
-#define AS_TRANSTAB_HI         0x04	/* (RW) Translation Table Base Address for address space n, high word */
-#define AS_MEMATTR_LO          0x08	/* (RW) Memory attributes for address space n, low word. */
-#define AS_MEMATTR_HI          0x0C	/* (RW) Memory attributes for address space n, high word. */
 #define AS_LOCKADDR_LO         0x10	/* (RW) Lock region address for address space n, low word */
 #define AS_LOCKADDR_HI         0x14	/* (RW) Lock region address for address space n, high word */
-#define AS_COMMAND             0x18	/* (WO) MMU command register for address space n */
 #define AS_FAULTSTATUS         0x1C	/* (RO) MMU fault status register for address space n */
 #define AS_FAULTADDRESS_LO     0x20	/* (RO) Fault Address for address space n, low word */
 #define AS_FAULTADDRESS_HI     0x24	/* (RO) Fault Address for address space n, high word */
 #define AS_STATUS              0x28	/* (RO) Status flags for address space n */

-/* (RW) Translation table configuration for address space n, low word */
-#define AS_TRANSCFG_LO         0x30
-/* (RW) Translation table configuration for address space n, high word */
-#define AS_TRANSCFG_HI         0x34
 /* (RO) Secondary fault address for address space n, low word */
 #define AS_FAULTEXTRA_LO       0x38
 /* (RO) Secondary fault address for address space n, high word */
@ -464,6 +432,80 @@
 #define L2_CONFIG_ASN_HASH_ENABLE_MASK         (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT)
 /* End L2_CONFIG register */

+/* AMBA_FEATURES register */
+#define AMBA_FEATURES_ACE_LITE_SHIFT GPU_U(0)
+#define AMBA_FEATURES_ACE_LITE_MASK (GPU_U(0x1) << AMBA_FEATURES_ACE_LITE_SHIFT)
+#define AMBA_FEATURES_ACE_LITE_GET(reg_val)                                    \
+	(((reg_val)&AMBA_FEATURES_ACE_LITE_MASK) >>                            \
+	 AMBA_FEATURES_ACE_LITE_SHIFT)
+#define AMBA_FEATURES_ACE_LITE_SET(reg_val, value)                             \
+	(((reg_val) & ~AMBA_FEATURES_ACE_LITE_MASK) |                          \
+	 (((value) << AMBA_FEATURES_ACE_LITE_SHIFT) &                          \
+	  AMBA_FEATURES_ACE_LITE_MASK))
+#define AMBA_FEATURES_ACE_SHIFT GPU_U(1)
+#define AMBA_FEATURES_ACE_MASK (GPU_U(0x1) << AMBA_FEATURES_ACE_SHIFT)
+#define AMBA_FEATURES_ACE_GET(reg_val)                                         \
+	(((reg_val)&AMBA_FEATURES_ACE_MASK) >> AMBA_FEATURES_ACE_SHIFT)
+#define AMBA_FEATURES_ACE_SET(reg_val, value)                                  \
+	(((reg_val) & ~AMBA_FEATURES_ACE_MASK) |                               \
+	 (((value) << AMBA_FEATURES_ACE_SHIFT) & AMBA_FEATURES_ACE_MASK))
+#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT GPU_U(5)
+#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK                                \
+	(GPU_U(0x1) << AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT)
+#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_GET(reg_val)                        \
+	(((reg_val)&AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK) >>                \
+	 AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT)
+#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SET(reg_val, value)                 \
+	(((reg_val) & ~AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK) |              \
+	 (((value) << AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT) &              \
+	  AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK))
+#define AMBA_FEATURES_INVALIDATE_HINT_SHIFT GPU_U(6)
+#define AMBA_FEATURES_INVALIDATE_HINT_MASK                                     \
+	(GPU_U(0x1) << AMBA_FEATURES_INVALIDATE_HINT_SHIFT)
+#define AMBA_FEATURES_INVALIDATE_HINT_GET(reg_val)                             \
+	(((reg_val)&AMBA_FEATURES_INVALIDATE_HINT_MASK) >>                     \
+	 AMBA_FEATURES_INVALIDATE_HINT_SHIFT)
+#define AMBA_FEATURES_INVALIDATE_HINT_SET(reg_val, value)                      \
+	(((reg_val) & ~AMBA_FEATURES_INVALIDATE_HINT_MASK) |                   \
+	 (((value) << AMBA_FEATURES_INVALIDATE_HINT_SHIFT) &                   \
+	  AMBA_FEATURES_INVALIDATE_HINT_MASK))
+
+/* AMBA_ENABLE register */
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT GPU_U(0)
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_MASK                                    \
+	(GPU_U(0x1F) << AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT)
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_GET(reg_val)                            \
+	(((reg_val)&AMBA_ENABLE_COHERENCY_PROTOCOL_MASK) >>                    \
+	 AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT)
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_SET(reg_val, value)                     \
+	(((reg_val) & ~AMBA_ENABLE_COHERENCY_PROTOCOL_MASK) |                  \
+	 (((value) << AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT) &                  \
+	  AMBA_ENABLE_COHERENCY_PROTOCOL_MASK))
+/* AMBA_ENABLE_coherency_protocol values */
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_ACE_LITE 0x0
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_ACE 0x1
+#define AMBA_ENABLE_COHERENCY_PROTOCOL_NO_COHERENCY 0x1F
+/* End of AMBA_ENABLE_coherency_protocol values */
+#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT GPU_U(5)
+#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK                                  \
+	(GPU_U(0x1) << AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT)
+#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_GET(reg_val)                          \
+	(((reg_val)&AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK) >>                  \
+	 AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT)
+#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SET(reg_val, value)                   \
+	(((reg_val) & ~AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK) |                \
+	 (((value) << AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT) &                \
+	  AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK))
+#define AMBA_ENABLE_INVALIDATE_HINT_SHIFT GPU_U(6)
+#define AMBA_ENABLE_INVALIDATE_HINT_MASK                                       \
+	(GPU_U(0x1) << AMBA_ENABLE_INVALIDATE_HINT_SHIFT)
+#define AMBA_ENABLE_INVALIDATE_HINT_GET(reg_val)                               \
+	(((reg_val)&AMBA_ENABLE_INVALIDATE_HINT_MASK) >>                       \
+	 AMBA_ENABLE_INVALIDATE_HINT_SHIFT)
+#define AMBA_ENABLE_INVALIDATE_HINT_SET(reg_val, value)                        \
+	(((reg_val) & ~AMBA_ENABLE_INVALIDATE_HINT_MASK) |                     \
+	 (((value) << AMBA_ENABLE_INVALIDATE_HINT_SHIFT) &                     \
+	  AMBA_ENABLE_INVALIDATE_HINT_MASK))

 /* IDVS_GROUP register */
 #define IDVS_GROUP_SIZE_SHIFT (16)
--- a/drivers/base/arm/dma_buf_lock/src/Kbuild
+++ b/drivers/base/arm/dma_buf_lock/src/Kbuild
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 #
-# (C) COPYRIGHT 2012, 2020-2021 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
 #
 # This program is free software and is provided to you under the terms of the
 # GNU General Public License version 2 as published by the Free Software
@ -18,6 +18,20 @@
 #
 #

-ifeq ($(CONFIG_DMA_BUF_LOCK), y)
-obj-m := dma_buf_lock.o
+bifrost_kbase-y += \
+    hwcnt/mali_kbase_hwcnt.o \
+    hwcnt/mali_kbase_hwcnt_gpu.o \
+    hwcnt/mali_kbase_hwcnt_gpu_narrow.o \
+    hwcnt/mali_kbase_hwcnt_types.o \
+    hwcnt/mali_kbase_hwcnt_virtualizer.o \
+    hwcnt/mali_kbase_hwcnt_watchdog_if_timer.o
+
+ifeq ($(CONFIG_MALI_CSF_SUPPORT),y)
+    bifrost_kbase-y += \
+        hwcnt/backend/mali_kbase_hwcnt_backend_csf.o \
+        hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.o
+else
+    bifrost_kbase-y += \
+        hwcnt/backend/mali_kbase_hwcnt_backend_jm.o \
+        hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.o
 endif
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -56,8 +56,8 @@ struct kbase_hwcnt_backend;
 *
 * Return: Non-NULL pointer to immutable hardware counter metadata.
 */
-typedef const struct kbase_hwcnt_metadata *kbase_hwcnt_backend_metadata_fn(
-	const struct kbase_hwcnt_backend_info *info);
+typedef const struct kbase_hwcnt_metadata *
+kbase_hwcnt_backend_metadata_fn(const struct kbase_hwcnt_backend_info *info);

 /**
 * typedef kbase_hwcnt_backend_init_fn - Initialise a counter backend.
@ -69,9 +69,8 @@ typedef const struct kbase_hwcnt_metadata *kbase_hwcnt_backend_metadata_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_init_fn(
-	const struct kbase_hwcnt_backend_info *info,
-	struct kbase_hwcnt_backend **out_backend);
+typedef int kbase_hwcnt_backend_init_fn(const struct kbase_hwcnt_backend_info *info,
+					struct kbase_hwcnt_backend **out_backend);

 /**
 * typedef kbase_hwcnt_backend_term_fn - Terminate a counter backend.
@ -86,8 +85,7 @@ typedef void kbase_hwcnt_backend_term_fn(struct kbase_hwcnt_backend *backend);
 *
 * Return: Backend timestamp in nanoseconds.
 */
-typedef u64 kbase_hwcnt_backend_timestamp_ns_fn(
-	struct kbase_hwcnt_backend *backend);
+typedef u64 kbase_hwcnt_backend_timestamp_ns_fn(struct kbase_hwcnt_backend *backend);

 /**
 * typedef kbase_hwcnt_backend_dump_enable_fn - Start counter dumping with the
@ -102,9 +100,8 @@ typedef u64 kbase_hwcnt_backend_timestamp_ns_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_enable_fn(
-	struct kbase_hwcnt_backend *backend,
-	const struct kbase_hwcnt_enable_map *enable_map);
+typedef int kbase_hwcnt_backend_dump_enable_fn(struct kbase_hwcnt_backend *backend,
+					       const struct kbase_hwcnt_enable_map *enable_map);

 /**
 * typedef kbase_hwcnt_backend_dump_enable_nolock_fn - Start counter dumping
@ -118,9 +115,9 @@ typedef int kbase_hwcnt_backend_dump_enable_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_enable_nolock_fn(
-	struct kbase_hwcnt_backend *backend,
-	const struct kbase_hwcnt_enable_map *enable_map);
+typedef int
+kbase_hwcnt_backend_dump_enable_nolock_fn(struct kbase_hwcnt_backend *backend,
+					  const struct kbase_hwcnt_enable_map *enable_map);

 /**
 * typedef kbase_hwcnt_backend_dump_disable_fn - Disable counter dumping with
@ -130,8 +127,7 @@ typedef int kbase_hwcnt_backend_dump_enable_nolock_fn(
 * If the backend is already disabled, does nothing.
 * Any undumped counter values since the last dump get will be lost.
 */
-typedef void kbase_hwcnt_backend_dump_disable_fn(
-	struct kbase_hwcnt_backend *backend);
+typedef void kbase_hwcnt_backend_dump_disable_fn(struct kbase_hwcnt_backend *backend);

 /**
 * typedef kbase_hwcnt_backend_dump_clear_fn - Reset all the current undumped
@ -142,8 +138,7 @@ typedef void kbase_hwcnt_backend_dump_disable_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_clear_fn(
-	struct kbase_hwcnt_backend *backend);
+typedef int kbase_hwcnt_backend_dump_clear_fn(struct kbase_hwcnt_backend *backend);

 /**
 * typedef kbase_hwcnt_backend_dump_request_fn - Request an asynchronous counter
@ -157,9 +152,8 @@ typedef int kbase_hwcnt_backend_dump_clear_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_request_fn(
-	struct kbase_hwcnt_backend *backend,
-	u64 *dump_time_ns);
+typedef int kbase_hwcnt_backend_dump_request_fn(struct kbase_hwcnt_backend *backend,
+						u64 *dump_time_ns);

 /**
 * typedef kbase_hwcnt_backend_dump_wait_fn - Wait until the last requested
@ -170,8 +164,7 @@ typedef int kbase_hwcnt_backend_dump_request_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_wait_fn(
-	struct kbase_hwcnt_backend *backend);
+typedef int kbase_hwcnt_backend_dump_wait_fn(struct kbase_hwcnt_backend *backend);

 /**
 * typedef kbase_hwcnt_backend_dump_get_fn - Copy or accumulate enable the
@ -189,11 +182,10 @@ typedef int kbase_hwcnt_backend_dump_wait_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_dump_get_fn(
-	struct kbase_hwcnt_backend *backend,
-	struct kbase_hwcnt_dump_buffer *dump_buffer,
-	const struct kbase_hwcnt_enable_map *enable_map,
-	bool accumulate);
+typedef int kbase_hwcnt_backend_dump_get_fn(struct kbase_hwcnt_backend *backend,
+					    struct kbase_hwcnt_dump_buffer *dump_buffer,
+					    const struct kbase_hwcnt_enable_map *enable_map,
+					    bool accumulate);

 /**
 * struct kbase_hwcnt_backend_interface - Hardware counter backend virtual
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -27,9 +27,9 @@
 #ifndef _KBASE_HWCNT_BACKEND_CSF_H_
 #define _KBASE_HWCNT_BACKEND_CSF_H_

-#include "mali_kbase_hwcnt_backend.h"
-#include "mali_kbase_hwcnt_backend_csf_if.h"
-#include "mali_kbase_hwcnt_watchdog_if.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h"
+#include "hwcnt/mali_kbase_hwcnt_watchdog_if.h"

 /**
 * kbase_hwcnt_backend_csf_create() - Create a CSF hardware counter backend
@ -47,10 +47,9 @@
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_backend_csf_create(
-	struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt,
-	struct kbase_hwcnt_watchdog_interface *watchdog_if,
-	struct kbase_hwcnt_backend_interface *iface);
+int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt,
+				   struct kbase_hwcnt_watchdog_interface *watchdog_if,
+				   struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_metadata_init() - Initialize the metadata for a CSF
@ -58,16 +57,14 @@ int kbase_hwcnt_backend_csf_create(
 * @iface: Non-NULL pointer to backend interface structure
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_backend_csf_metadata_init(
-	struct kbase_hwcnt_backend_interface *iface);
+int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_metadata_term() - Terminate the metadata for a CSF
 *                                           hardware counter backend.
 * @iface: Non-NULL pointer to backend interface structure.
 */
-void kbase_hwcnt_backend_csf_metadata_term(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_metadata_term(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_destroy() - Destroy a CSF hardware counter backend
@ -77,8 +74,7 @@ void kbase_hwcnt_backend_csf_metadata_term(
 * Can be safely called on an all-zeroed interface, or on an already destroyed
 * interface.
 */
-void kbase_hwcnt_backend_csf_destroy(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_protm_entered() - CSF HWC backend function to receive
@ -86,8 +82,7 @@ void kbase_hwcnt_backend_csf_destroy(
 *                                           has been entered.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_protm_entered(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_protm_entered(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_protm_exited() - CSF HWC backend function to receive
@ -95,8 +90,7 @@ void kbase_hwcnt_backend_csf_protm_entered(
 *                                          been exited.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_protm_exited(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_protm_exited(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_unrecoverable_error() - CSF HWC backend function
@ -108,8 +102,7 @@ void kbase_hwcnt_backend_csf_protm_exited(
 * with reset, or that may put HWC logic in a state that could result in a hang. For
 * example, on bus error, or when FW becomes unresponsive.
 */
-void kbase_hwcnt_backend_csf_on_unrecoverable_error(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_unrecoverable_error(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_before_reset() - CSF HWC backend function to be
@ -119,16 +112,14 @@ void kbase_hwcnt_backend_csf_on_unrecoverable_error(
 *                                             were in it.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_before_reset(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_before_reset(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_prfcnt_sample() - CSF performance counter sample
 *                                              complete interrupt handler.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_prfcnt_sample(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_prfcnt_sample(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_prfcnt_threshold() - CSF performance counter
@ -136,31 +127,27 @@ void kbase_hwcnt_backend_csf_on_prfcnt_sample(
 *                                                 interrupt handler.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_prfcnt_threshold(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_prfcnt_threshold(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_prfcnt_overflow() - CSF performance counter buffer
 *                                                overflow interrupt handler.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_prfcnt_overflow(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_prfcnt_overflow(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_prfcnt_enable() - CSF performance counter enabled
 *                                              interrupt handler.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_prfcnt_enable(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_prfcnt_enable(struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_csf_on_prfcnt_disable() - CSF performance counter
 *                                               disabled interrupt handler.
 * @iface: Non-NULL pointer to HWC backend interface.
 */
-void kbase_hwcnt_backend_csf_on_prfcnt_disable(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_csf_on_prfcnt_disable(struct kbase_hwcnt_backend_interface *iface);

 #endif /* _KBASE_HWCNT_BACKEND_CSF_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h
@ -85,8 +85,8 @@ struct kbase_hwcnt_backend_csf_if_prfcnt_info {
 *                                                          held.
 * @ctx: Non-NULL pointer to a CSF context.
 */
-typedef void kbase_hwcnt_backend_csf_if_assert_lock_held_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void
+kbase_hwcnt_backend_csf_if_assert_lock_held_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);

 /**
 * typedef kbase_hwcnt_backend_csf_if_lock_fn - Acquire backend spinlock.
@ -95,9 +95,8 @@ typedef void kbase_hwcnt_backend_csf_if_assert_lock_held_fn(
 * @flags: Pointer to the memory location that would store the previous
 *         interrupt state.
 */
-typedef void kbase_hwcnt_backend_csf_if_lock_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	unsigned long *flags);
+typedef void kbase_hwcnt_backend_csf_if_lock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						unsigned long *flags);

 /**
 * typedef kbase_hwcnt_backend_csf_if_unlock_fn - Release backend spinlock.
@ -106,9 +105,8 @@ typedef void kbase_hwcnt_backend_csf_if_lock_fn(
 * @flags: Previously stored interrupt state when Scheduler interrupt
 *         spinlock was acquired.
 */
-typedef void kbase_hwcnt_backend_csf_if_unlock_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	unsigned long flags);
+typedef void kbase_hwcnt_backend_csf_if_unlock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						  unsigned long flags);

 /**
 * typedef kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn - Get performance
@ -137,10 +135,10 @@ typedef void kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn(
 *
 * Return: 0 on success, else error code.
 */
-typedef int kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count,
-	void **cpu_dump_base,
-	struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf);
+typedef int
+kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					     u32 buf_count, void **cpu_dump_base,
+					     struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf);

 /**
 * typedef kbase_hwcnt_backend_csf_if_ring_buf_sync_fn - Sync HWC dump buffers
@ -159,10 +157,10 @@ typedef int kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(
 * Flush cached HWC dump buffer data to ensure that all writes from GPU and CPU
 * are correctly observed.
 */
-typedef void kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
-	u32 buf_index_first, u32 buf_index_last, bool for_cpu);
+typedef void
+kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					    struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
+					    u32 buf_index_first, u32 buf_index_last, bool for_cpu);

 /**
 * typedef kbase_hwcnt_backend_csf_if_ring_buf_free_fn - Free a ring buffer for
@ -171,9 +169,9 @@ typedef void kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(
 * @ctx:      Non-NULL pointer to a CSF interface context.
 * @ring_buf: Non-NULL pointer to the ring buffer which is to be freed.
 */
-typedef void kbase_hwcnt_backend_csf_if_ring_buf_free_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf);
+typedef void
+kbase_hwcnt_backend_csf_if_ring_buf_free_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					    struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf);

 /**
 * typedef kbase_hwcnt_backend_csf_if_timestamp_ns_fn - Get the current
@ -183,8 +181,7 @@ typedef void kbase_hwcnt_backend_csf_if_ring_buf_free_fn(
 *
 * Return: CSF interface timestamp in nanoseconds.
 */
-typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);

 /**
 * typedef kbase_hwcnt_backend_csf_if_dump_enable_fn - Setup and enable hardware
@ -195,10 +192,10 @@ typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_dump_enable_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
-	struct kbase_hwcnt_backend_csf_if_enable *enable);
+typedef void
+kbase_hwcnt_backend_csf_if_dump_enable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					  struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
+					  struct kbase_hwcnt_backend_csf_if_enable *enable);

 /**
 * typedef kbase_hwcnt_backend_csf_if_dump_disable_fn - Disable hardware counter
@ -207,8 +204,7 @@ typedef void kbase_hwcnt_backend_csf_if_dump_enable_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);

 /**
 * typedef kbase_hwcnt_backend_csf_if_dump_request_fn - Request a HWC dump.
@ -217,8 +213,7 @@ typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);

 /**
 * typedef kbase_hwcnt_backend_csf_if_get_indexes_fn - Get current extract and
@ -231,9 +226,8 @@ typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index,
-	u32 *insert_index);
+typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						       u32 *extract_index, u32 *insert_index);

 /**
 * typedef kbase_hwcnt_backend_csf_if_set_extract_index_fn - Update the extract
@ -245,8 +239,9 @@ typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_set_extract_index_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_index);
+typedef void
+kbase_hwcnt_backend_csf_if_set_extract_index_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						u32 extract_index);

 /**
 * typedef kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn - Get the current
@ -260,9 +255,9 @@ typedef void kbase_hwcnt_backend_csf_if_set_extract_index_fn(
 *
 * Requires lock to be taken before calling.
 */
-typedef void kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts,
-	u64 clk_enable_map);
+typedef void
+kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						  u64 *cycle_counts, u64 clk_enable_map);

 /**
 * struct kbase_hwcnt_backend_csf_if - Hardware counter backend CSF virtual
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c
@ -26,12 +26,12 @@
 #include <mali_kbase.h>
 #include <gpu/mali_kbase_gpu_regmap.h>
 #include <device/mali_kbase_device.h>
-#include "mali_kbase_hwcnt_gpu.h"
-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/mali_kbase_hwcnt_gpu.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"
 #include <csf/mali_kbase_csf_registers.h>

 #include "csf/mali_kbase_csf_firmware.h"
-#include "mali_kbase_hwcnt_backend_csf_if_fw.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h"
 #include "mali_kbase_hwaccess_time.h"
 #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h"

@ -42,9 +42,6 @@
 #include <backend/gpu/mali_kbase_model_dummy.h>
 #endif /* CONFIG_MALI_BIFROST_NO_MALI */

-/** The number of nanoseconds in a second. */
-#define NSECS_IN_SEC 1000000000ull /* ns */
-
 /* Ring buffer virtual address starts at 4GB */
 #define KBASE_HWC_CSF_RING_BUFFER_VA_START (1ull << 32)

@ -90,8 +87,8 @@ struct kbase_hwcnt_backend_csf_if_fw_ctx {
 	struct kbase_ccswe ccswe_shader_cores;
 };

-static void kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx)
+static void
+kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(struct kbase_hwcnt_backend_csf_if_ctx *ctx)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx;
 	struct kbase_device *kbdev;
@ -104,9 +101,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(
 	kbase_csf_scheduler_spin_lock_assert_held(kbdev);
 }

-static void
-kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-				    unsigned long *flags)
+static void kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						unsigned long *flags)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx;
 	struct kbase_device *kbdev;
@ -119,8 +115,8 @@ kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
 	kbase_csf_scheduler_spin_lock(kbdev, flags);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_unlock(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, unsigned long flags)
+static void kbasep_hwcnt_backend_csf_if_fw_unlock(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						  unsigned long flags)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx;
 	struct kbase_device *kbdev;
@ -141,22 +137,19 @@ static void kbasep_hwcnt_backend_csf_if_fw_unlock(
 * @clk_index:        Clock index
 * @clk_rate_hz:      Clock frequency(hz)
 */
-static void kbasep_hwcnt_backend_csf_if_fw_on_freq_change(
-	struct kbase_clk_rate_listener *rate_listener, u32 clk_index,
-	u32 clk_rate_hz)
+static void
+kbasep_hwcnt_backend_csf_if_fw_on_freq_change(struct kbase_clk_rate_listener *rate_listener,
+					      u32 clk_index, u32 clk_rate_hz)
 {
-	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx =
-		container_of(rate_listener,
-			     struct kbase_hwcnt_backend_csf_if_fw_ctx,
-			     rate_listener);
+	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = container_of(
+		rate_listener, struct kbase_hwcnt_backend_csf_if_fw_ctx, rate_listener);
 	u64 timestamp_ns;

 	if (clk_index != KBASE_CLOCK_DOMAIN_SHADER_CORES)
 		return;

 	timestamp_ns = ktime_get_raw_ns();
-	kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns,
-				clk_rate_hz);
+	kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns, clk_rate_hz);
 }

 /**
@ -165,17 +158,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_on_freq_change(
 * @fw_ctx:         Non-NULL pointer to CSF firmware interface context.
 * @clk_enable_map: Non-NULL pointer to enable map specifying enabled counters.
 */
-static void kbasep_hwcnt_backend_csf_if_fw_cc_enable(
-	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx, u64 clk_enable_map)
+static void
+kbasep_hwcnt_backend_csf_if_fw_cc_enable(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx,
+					 u64 clk_enable_map)
 {
 	struct kbase_device *kbdev = fw_ctx->kbdev;

-	if (kbase_hwcnt_clk_enable_map_enabled(
-		    clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
 		/* software estimation for non-top clock domains */
 		struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm;
-		const struct kbase_clk_data *clk_data =
-			rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES];
+		const struct kbase_clk_data *clk_data = rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES];
 		u32 cur_freq;
 		unsigned long flags;
 		u64 timestamp_ns;
@ -186,11 +178,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_cc_enable(

 		cur_freq = (u32)clk_data->clock_val;
 		kbase_ccswe_reset(&fw_ctx->ccswe_shader_cores);
-		kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores,
-					timestamp_ns, cur_freq);
+		kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns, cur_freq);

-		kbase_clk_rate_trace_manager_subscribe_no_lock(
-			rtm, &fw_ctx->rate_listener);
+		kbase_clk_rate_trace_manager_subscribe_no_lock(rtm, &fw_ctx->rate_listener);

 		spin_unlock_irqrestore(&rtm->lock, flags);
 	}
@ -203,17 +193,15 @@ static void kbasep_hwcnt_backend_csf_if_fw_cc_enable(
 *
 * @fw_ctx:     Non-NULL pointer to CSF firmware interface context.
 */
-static void kbasep_hwcnt_backend_csf_if_fw_cc_disable(
-	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx)
+static void
+kbasep_hwcnt_backend_csf_if_fw_cc_disable(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx)
 {
 	struct kbase_device *kbdev = fw_ctx->kbdev;
 	struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm;
 	u64 clk_enable_map = fw_ctx->clk_enable_map;

-	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map,
-					       KBASE_CLOCK_DOMAIN_SHADER_CORES))
-		kbase_clk_rate_trace_manager_unsubscribe(
-			rtm, &fw_ctx->rate_listener);
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES))
+		kbase_clk_rate_trace_manager_unsubscribe(rtm, &fw_ctx->rate_listener);
 }

 static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
@ -244,8 +232,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
 	u32 prfcnt_size;
 	u32 prfcnt_hw_size;
 	u32 prfcnt_fw_size;
-	u32 prfcnt_block_size = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK *
-				KBASE_HWCNT_VALUE_HW_BYTES;
+	u32 prfcnt_block_size =
+		KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * KBASE_HWCNT_VALUE_HW_BYTES;

 	WARN_ON(!ctx);
 	WARN_ON(!prfcnt_info);
@ -262,10 +250,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
 	 */
 	if ((kbdev->gpu_props.props.raw_props.gpu_id & GPU_ID2_PRODUCT_MODEL) >=
 	    GPU_ID2_PRODUCT_TTUX) {
-		prfcnt_block_size =
-			PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(kbase_reg_read(
-				kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES)))
-			<< 8;
+		prfcnt_block_size = PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(
+					    kbase_reg_read(kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES)))
+				    << 8;
 	}

 	*prfcnt_info = (struct kbase_hwcnt_backend_csf_if_prfcnt_info){
@ -280,17 +267,14 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
 	};

 	/* Block size must be multiple of counter size. */
-	WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) !=
-		0);
+	WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != 0);
 	/* Total size must be multiple of block size. */
-	WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) !=
-		0);
+	WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) != 0);
 #endif
 }

 static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count,
-	void **cpu_dump_base,
+	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count, void **cpu_dump_base,
 	struct kbase_hwcnt_backend_csf_if_ring_buf **out_ring_buf)
 {
 	struct kbase_device *kbdev;
@ -342,9 +326,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
 		goto page_list_alloc_error;

 	/* Get physical page for the buffer */
-	ret = kbase_mem_pool_alloc_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
-		phys, false);
+	ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
+					 phys, false);
 	if (ret != num_pages)
 		goto phys_mem_pool_alloc_error;

@ -360,9 +343,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
 		KBASE_REG_MEMATTR_INDEX(AS_MEMATTR_INDEX_NON_CACHEABLE);

 	/* Update MMU table */
-	ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu,
-				     gpu_va_base >> PAGE_SHIFT, phys, num_pages,
-				     flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW,
+	ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, gpu_va_base >> PAGE_SHIFT, phys,
+				     num_pages, flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW,
 				     mmu_sync_info);
 	if (ret)
 		goto mmu_insert_failed;
@ -381,17 +363,15 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
 	fw_ring_buf->as_nr = MCU_AS_NR;

 	*cpu_dump_base = fw_ring_buf->cpu_dump_base;
-	*out_ring_buf =
-		(struct kbase_hwcnt_backend_csf_if_ring_buf *)fw_ring_buf;
+	*out_ring_buf = (struct kbase_hwcnt_backend_csf_if_ring_buf *)fw_ring_buf;

 	return 0;

 mmu_insert_failed:
 	vunmap(cpu_addr);
 vmap_error:
-	kbase_mem_pool_free_pages(
-		&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
-		phys, false, false);
+	kbase_mem_pool_free_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, phys,
+				  false, false);
 phys_mem_pool_alloc_error:
 	kfree(page_list);
 page_list_alloc_error:
@ -401,10 +381,10 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
 	return -ENOMEM;
 }

-static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
-	u32 buf_index_first, u32 buf_index_last, bool for_cpu)
+static void
+kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					     struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
+					     u32 buf_index_first, u32 buf_index_last, bool for_cpu)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf =
 		(struct kbase_hwcnt_backend_csf_if_fw_ring_buf *)ring_buf;
@ -435,8 +415,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(
 	 * inclusive at both ends so full flushes are not 0 -> 0.
 	 */
 	ring_buf_index_first = buf_index_first & (fw_ring_buf->buf_count - 1);
-	ring_buf_index_last =
-		(buf_index_last - 1) & (fw_ring_buf->buf_count - 1);
+	ring_buf_index_last = (buf_index_last - 1) & (fw_ring_buf->buf_count - 1);

 	/* The start address is the offset of the first buffer. */
 	start_address = fw_ctx->buf_bytes * ring_buf_index_first;
@ -453,15 +432,11 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(
 			struct page *pg = as_page(fw_ring_buf->phys[i]);

 			if (for_cpu) {
-				kbase_sync_single_for_cpu(fw_ctx->kbdev,
-							  kbase_dma_addr(pg),
-							  PAGE_SIZE,
-							  DMA_BIDIRECTIONAL);
+				kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg),
+							  PAGE_SIZE, DMA_BIDIRECTIONAL);
 			} else {
-				kbase_sync_single_for_device(fw_ctx->kbdev,
-							     kbase_dma_addr(pg),
-							     PAGE_SIZE,
-							     DMA_BIDIRECTIONAL);
+				kbase_sync_single_for_device(fw_ctx->kbdev, kbase_dma_addr(pg),
+							     PAGE_SIZE, DMA_BIDIRECTIONAL);
 			}
 		}

@ -473,28 +448,24 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(
 		struct page *pg = as_page(fw_ring_buf->phys[i]);

 		if (for_cpu) {
-			kbase_sync_single_for_cpu(fw_ctx->kbdev,
-						  kbase_dma_addr(pg), PAGE_SIZE,
+			kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg), PAGE_SIZE,
 						  DMA_BIDIRECTIONAL);
 		} else {
-			kbase_sync_single_for_device(fw_ctx->kbdev,
-						     kbase_dma_addr(pg),
-						     PAGE_SIZE,
+			kbase_sync_single_for_device(fw_ctx->kbdev, kbase_dma_addr(pg), PAGE_SIZE,
 						     DMA_BIDIRECTIONAL);
 		}
 	}
 }

-static u64 kbasep_hwcnt_backend_csf_if_fw_timestamp_ns(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx)
+static u64 kbasep_hwcnt_backend_csf_if_fw_timestamp_ns(struct kbase_hwcnt_backend_csf_if_ctx *ctx)
 {
 	CSTD_UNUSED(ctx);
 	return ktime_get_raw_ns();
 }

-static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf)
+static void
+kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					     struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf =
 		(struct kbase_hwcnt_backend_csf_if_fw_ring_buf *)ring_buf;
@ -513,10 +484,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(

 		vunmap(fw_ring_buf->cpu_dump_base);

-		kbase_mem_pool_free_pages(
-			&fw_ctx->kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
-			fw_ring_buf->num_pages, fw_ring_buf->phys, false,
-			false);
+		kbase_mem_pool_free_pages(&fw_ctx->kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW],
+					  fw_ring_buf->num_pages, fw_ring_buf->phys, false, false);

 		kfree(fw_ring_buf->phys);

@ -524,10 +493,10 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(
 	}
 }

-static void kbasep_hwcnt_backend_csf_if_fw_dump_enable(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx,
-	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
-	struct kbase_hwcnt_backend_csf_if_enable *enable)
+static void
+kbasep_hwcnt_backend_csf_if_fw_dump_enable(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+					   struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
+					   struct kbase_hwcnt_backend_csf_if_enable *enable)
 {
 	u32 prfcnt_config;
 	struct kbase_device *kbdev;
@ -550,8 +519,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable(
 	prfcnt_config = GLB_PRFCNT_CONFIG_SET_SELECT_SET(prfcnt_config, enable->counter_set);

 	/* Configure the ring buffer base address */
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_JASID,
-					fw_ring_buf->as_nr);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_JASID, fw_ring_buf->as_nr);
 	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_BASE_LO,
 					fw_ring_buf->gpu_dump_base & U32_MAX);
 	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_BASE_HI,
@ -561,38 +529,29 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable(
 	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_EXTRACT, 0);

 	/* Configure the enable bitmap */
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSF_EN,
-					enable->fe_bm);
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_SHADER_EN,
-					enable->shader_bm);
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_MMU_L2_EN,
-					enable->mmu_l2_bm);
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_TILER_EN,
-					enable->tiler_bm);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSF_EN, enable->fe_bm);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_SHADER_EN, enable->shader_bm);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_MMU_L2_EN, enable->mmu_l2_bm);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_TILER_EN, enable->tiler_bm);

 	/* Configure the HWC set and buffer size */
-	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CONFIG,
-					prfcnt_config);
+	kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CONFIG, prfcnt_config);

 	kbdev->csf.hwcnt.enable_pending = true;

 	/* Unmask the interrupts */
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK);
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK);
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK);
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK,
-		GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK,
+					     GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK);

 	/* Enable the HWC */
 	kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ,
@ -600,15 +559,12 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable(
 					     GLB_REQ_PRFCNT_ENABLE_MASK);
 	kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR);

-	prfcnt_config = kbase_csf_firmware_global_input_read(global_iface,
-							     GLB_PRFCNT_CONFIG);
+	prfcnt_config = kbase_csf_firmware_global_input_read(global_iface, GLB_PRFCNT_CONFIG);

-	kbasep_hwcnt_backend_csf_if_fw_cc_enable(fw_ctx,
-						 enable->clk_enable_map);
+	kbasep_hwcnt_backend_csf_if_fw_cc_enable(fw_ctx, enable->clk_enable_map);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_dump_disable(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx)
+static void kbasep_hwcnt_backend_csf_if_fw_dump_disable(struct kbase_hwcnt_backend_csf_if_ctx *ctx)
 {
 	struct kbase_device *kbdev;
 	struct kbase_csf_global_iface *global_iface;
@ -623,20 +579,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable(

 	/* Disable the HWC */
 	kbdev->csf.hwcnt.enable_pending = true;
-	kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, 0,
-					     GLB_REQ_PRFCNT_ENABLE_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, 0, GLB_REQ_PRFCNT_ENABLE_MASK);
 	kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR);

 	/* mask the interrupts */
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK, 0,
-		GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK);
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK, 0,
-		GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK);
-	kbase_csf_firmware_global_input_mask(
-		global_iface, GLB_ACK_IRQ_MASK, 0,
-		GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0,
+					     GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0,
+					     GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK);
+	kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0,
+					     GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK);

 	/* In case we have a previous request in flight when the disable
 	 * happens.
@ -646,8 +598,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable(
 	kbasep_hwcnt_backend_csf_if_fw_cc_disable(fw_ctx);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_dump_request(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx)
+static void kbasep_hwcnt_backend_csf_if_fw_dump_request(struct kbase_hwcnt_backend_csf_if_ctx *ctx)
 {
 	u32 glb_req;
 	struct kbase_device *kbdev;
@ -670,9 +621,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_request(
 	kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_get_indexes(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index,
-	u32 *insert_index)
+static void kbasep_hwcnt_backend_csf_if_fw_get_indexes(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						       u32 *extract_index, u32 *insert_index)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx =
 		(struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
@ -682,14 +632,15 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_indexes(
 	WARN_ON(!insert_index);
 	kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx);

-	*extract_index = kbase_csf_firmware_global_input_read(
-		&fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT);
-	*insert_index = kbase_csf_firmware_global_output(
-		&fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_INSERT);
+	*extract_index = kbase_csf_firmware_global_input_read(&fw_ctx->kbdev->csf.global_iface,
+							      GLB_PRFCNT_EXTRACT);
+	*insert_index = kbase_csf_firmware_global_output(&fw_ctx->kbdev->csf.global_iface,
+							 GLB_PRFCNT_INSERT);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_set_extract_index(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_idx)
+static void
+kbasep_hwcnt_backend_csf_if_fw_set_extract_index(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						 u32 extract_idx)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx =
 		(struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
@ -700,13 +651,13 @@ static void kbasep_hwcnt_backend_csf_if_fw_set_extract_index(
 	/* Set the raw extract index to release the buffer back to the ring
 	 * buffer.
 	 */
-	kbase_csf_firmware_global_input(&fw_ctx->kbdev->csf.global_iface,
-					GLB_PRFCNT_EXTRACT, extract_idx);
+	kbase_csf_firmware_global_input(&fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT,
+					extract_idx);
 }

-static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count(
-	struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts,
-	u64 clk_enable_map)
+static void
+kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+						   u64 *cycle_counts, u64 clk_enable_map)
 {
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx =
 		(struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
@ -723,12 +674,12 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count(

 		if (clk == KBASE_CLOCK_DOMAIN_TOP) {
 			/* Read cycle count for top clock domain. */
-			kbase_backend_get_gpu_time_norequest(
-				fw_ctx->kbdev, &cycle_counts[clk], NULL, NULL);
+			kbase_backend_get_gpu_time_norequest(fw_ctx->kbdev, &cycle_counts[clk],
+							     NULL, NULL);
 		} else {
 			/* Estimate cycle count for non-top clock domain. */
-			cycle_counts[clk] = kbase_ccswe_cycle_at(
-				&fw_ctx->ccswe_shader_cores, timestamp_ns);
+			cycle_counts[clk] =
+				kbase_ccswe_cycle_at(&fw_ctx->ccswe_shader_cores, timestamp_ns);
 		}
 	}
 }
@ -738,8 +689,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count(
 *
 * @fw_ctx: Pointer to context to destroy.
 */
-static void kbasep_hwcnt_backend_csf_if_fw_ctx_destroy(
-	struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx)
+static void
+kbasep_hwcnt_backend_csf_if_fw_ctx_destroy(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx)
 {
 	if (!fw_ctx)
 		return;
@ -754,9 +705,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_ctx_destroy(
 * @out_ctx: Non-NULL pointer to where info is stored on success.
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_backend_csf_if_fw_ctx_create(
-	struct kbase_device *kbdev,
-	struct kbase_hwcnt_backend_csf_if_fw_ctx **out_ctx)
+static int
+kbasep_hwcnt_backend_csf_if_fw_ctx_create(struct kbase_device *kbdev,
+					  struct kbase_hwcnt_backend_csf_if_fw_ctx **out_ctx)
 {
 	u8 clk;
 	int errcode = -ENOMEM;
@ -780,8 +731,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ctx_create(

 	ctx->clk_enable_map = 0;
 	kbase_ccswe_init(&ctx->ccswe_shader_cores);
-	ctx->rate_listener.notify =
-		kbasep_hwcnt_backend_csf_if_fw_on_freq_change;
+	ctx->rate_listener.notify = kbasep_hwcnt_backend_csf_if_fw_on_freq_change;

 	*out_ctx = ctx;

@ -791,8 +741,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ctx_create(
 	return errcode;
 }

-void kbase_hwcnt_backend_csf_if_fw_destroy(
-	struct kbase_hwcnt_backend_csf_if *if_fw)
+void kbase_hwcnt_backend_csf_if_fw_destroy(struct kbase_hwcnt_backend_csf_if *if_fw)
 {
 	if (!if_fw)
 		return;
@ -802,8 +751,8 @@ void kbase_hwcnt_backend_csf_if_fw_destroy(
 	memset(if_fw, 0, sizeof(*if_fw));
 }

-int kbase_hwcnt_backend_csf_if_fw_create(
-	struct kbase_device *kbdev, struct kbase_hwcnt_backend_csf_if *if_fw)
+int kbase_hwcnt_backend_csf_if_fw_create(struct kbase_device *kbdev,
+					 struct kbase_hwcnt_backend_csf_if *if_fw)
 {
 	int errcode;
 	struct kbase_hwcnt_backend_csf_if_fw_ctx *ctx = NULL;
@ -816,8 +765,7 @@ int kbase_hwcnt_backend_csf_if_fw_create(
 		return errcode;

 	if_fw->ctx = (struct kbase_hwcnt_backend_csf_if_ctx *)ctx;
-	if_fw->assert_lock_held =
-		kbasep_hwcnt_backend_csf_if_fw_assert_lock_held;
+	if_fw->assert_lock_held = kbasep_hwcnt_backend_csf_if_fw_assert_lock_held;
 	if_fw->lock = kbasep_hwcnt_backend_csf_if_fw_lock;
 	if_fw->unlock = kbasep_hwcnt_backend_csf_if_fw_unlock;
 	if_fw->get_prfcnt_info = kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info;
@ -828,11 +776,9 @@ int kbase_hwcnt_backend_csf_if_fw_create(
 	if_fw->dump_enable = kbasep_hwcnt_backend_csf_if_fw_dump_enable;
 	if_fw->dump_disable = kbasep_hwcnt_backend_csf_if_fw_dump_disable;
 	if_fw->dump_request = kbasep_hwcnt_backend_csf_if_fw_dump_request;
-	if_fw->get_gpu_cycle_count =
-		kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count;
+	if_fw->get_gpu_cycle_count = kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count;
 	if_fw->get_indexes = kbasep_hwcnt_backend_csf_if_fw_get_indexes;
-	if_fw->set_extract_index =
-		kbasep_hwcnt_backend_csf_if_fw_set_extract_index;
+	if_fw->set_extract_index = kbasep_hwcnt_backend_csf_if_fw_set_extract_index;

 	return 0;
 }
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -26,7 +26,7 @@
 #ifndef _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_
 #define _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_

-#include "mali_kbase_hwcnt_backend_csf_if.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h"

 /**
 * kbase_hwcnt_backend_csf_if_fw_create() - Create a firmware CSF interface
@ -36,15 +36,14 @@
 *         creation success.
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_backend_csf_if_fw_create(
-	struct kbase_device *kbdev, struct kbase_hwcnt_backend_csf_if *if_fw);
+int kbase_hwcnt_backend_csf_if_fw_create(struct kbase_device *kbdev,
+					 struct kbase_hwcnt_backend_csf_if *if_fw);

 /**
 * kbase_hwcnt_backend_csf_if_fw_destroy() - Destroy a firmware CSF interface of
 *                                           hardware counter backend.
 * @if_fw: Pointer to a CSF interface to destroy.
 */
-void kbase_hwcnt_backend_csf_if_fw_destroy(
-	struct kbase_hwcnt_backend_csf_if *if_fw);
+void kbase_hwcnt_backend_csf_if_fw_destroy(struct kbase_hwcnt_backend_csf_if *if_fw);

 #endif /* _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c
@ -19,9 +19,9 @@
 *
 */

-#include "mali_kbase_hwcnt_backend_jm.h"
-#include "mali_kbase_hwcnt_gpu.h"
-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend_jm.h"
+#include "hwcnt/mali_kbase_hwcnt_gpu.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"
 #include "mali_kbase.h"
 #include "backend/gpu/mali_kbase_pm_ca.h"
 #include "mali_kbase_hwaccess_instr.h"
@ -136,9 +136,8 @@ struct kbase_hwcnt_backend_jm {
 *
 * Return: 0 on success, else error code.
 */
-static int
-kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
-				      struct kbase_hwcnt_gpu_info *info)
+static int kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
+						 struct kbase_hwcnt_gpu_info *info)
 {
 	size_t clk;

@ -153,13 +152,11 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
 	{
 		const struct base_gpu_props *props = &kbdev->gpu_props.props;
 		const size_t l2_count = props->l2_props.num_l2_slices;
-		const size_t core_mask =
-			props->coherency_info.group[0].core_mask;
+		const size_t core_mask = props->coherency_info.group[0].core_mask;

 		info->l2_count = l2_count;
 		info->core_mask = core_mask;
-		info->prfcnt_values_per_block =
-			KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
+		info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
 	}
 #endif /* CONFIG_MALI_BIFROST_NO_MALI */

@ -173,9 +170,8 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
 	return 0;
 }

-static void kbasep_hwcnt_backend_jm_init_layout(
-	const struct kbase_hwcnt_gpu_info *gpu_info,
-	struct kbase_hwcnt_jm_physical_layout *phys_layout)
+static void kbasep_hwcnt_backend_jm_init_layout(const struct kbase_hwcnt_gpu_info *gpu_info,
+						struct kbase_hwcnt_jm_physical_layout *phys_layout)
 {
 	u8 shader_core_cnt;

@ -189,32 +185,29 @@ static void kbasep_hwcnt_backend_jm_init_layout(
 		.tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT,
 		.mmu_l2_cnt = gpu_info->l2_count,
 		.shader_cnt = shader_core_cnt,
-		.block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT +
-			     KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
+		.block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
 			     gpu_info->l2_count + shader_core_cnt,
 		.shader_avail_mask = gpu_info->core_mask,
 		.headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
 		.values_per_block = gpu_info->prfcnt_values_per_block,
-		.counters_per_block = gpu_info->prfcnt_values_per_block -
-				      KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+		.counters_per_block =
+			gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
 		.enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER,
 	};
 }

-static void kbasep_hwcnt_backend_jm_dump_sample(
-	const struct kbase_hwcnt_backend_jm *const backend_jm)
+static void
+kbasep_hwcnt_backend_jm_dump_sample(const struct kbase_hwcnt_backend_jm *const backend_jm)
 {
 	size_t block_idx;
 	const u32 *new_sample_buf = backend_jm->cpu_dump_va;
 	const u32 *new_block = new_sample_buf;
 	u64 *dst_buf = backend_jm->to_user_buf;
 	u64 *dst_block = dst_buf;
-	const size_t values_per_block =
-		backend_jm->phys_layout.values_per_block;
+	const size_t values_per_block = backend_jm->phys_layout.values_per_block;
 	const size_t dump_bytes = backend_jm->info->dump_bytes;

-	for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt;
-	     block_idx++) {
+	for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt; block_idx++) {
 		size_t ctr_idx;

 		for (ctr_idx = 0; ctr_idx < values_per_block; ctr_idx++)
@ -224,10 +217,8 @@ static void kbasep_hwcnt_backend_jm_dump_sample(
 		dst_block += values_per_block;
 	}

-	WARN_ON(new_block !=
-		new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
-	WARN_ON(dst_block !=
-		dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
+	WARN_ON(new_block != new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
+	WARN_ON(dst_block != dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
 }

 /**
@ -237,21 +228,18 @@ static void kbasep_hwcnt_backend_jm_dump_sample(
 * @clk_index:        Clock index
 * @clk_rate_hz:      Clock frequency(hz)
 */
-static void kbasep_hwcnt_backend_jm_on_freq_change(
-	struct kbase_clk_rate_listener *rate_listener,
-	u32 clk_index,
-	u32 clk_rate_hz)
+static void kbasep_hwcnt_backend_jm_on_freq_change(struct kbase_clk_rate_listener *rate_listener,
+						   u32 clk_index, u32 clk_rate_hz)
 {
-	struct kbase_hwcnt_backend_jm *backend_jm = container_of(
-		rate_listener, struct kbase_hwcnt_backend_jm, rate_listener);
+	struct kbase_hwcnt_backend_jm *backend_jm =
+		container_of(rate_listener, struct kbase_hwcnt_backend_jm, rate_listener);
 	u64 timestamp_ns;

 	if (clk_index != KBASE_CLOCK_DOMAIN_SHADER_CORES)
 		return;

 	timestamp_ns = ktime_get_raw_ns();
-	kbase_ccswe_freq_change(
-		&backend_jm->ccswe_shader_cores, timestamp_ns, clk_rate_hz);
+	kbase_ccswe_freq_change(&backend_jm->ccswe_shader_cores, timestamp_ns, clk_rate_hz);
 }

 /**
@ -261,53 +249,42 @@ static void kbasep_hwcnt_backend_jm_on_freq_change(
 * @enable_map:   Non-NULL pointer to enable map specifying enabled counters.
 * @timestamp_ns: Timestamp(ns) when HWCNT were enabled.
 */
-static void kbasep_hwcnt_backend_jm_cc_enable(
-	struct kbase_hwcnt_backend_jm *backend_jm,
-	const struct kbase_hwcnt_enable_map *enable_map,
-	u64 timestamp_ns)
+static void kbasep_hwcnt_backend_jm_cc_enable(struct kbase_hwcnt_backend_jm *backend_jm,
+					      const struct kbase_hwcnt_enable_map *enable_map,
+					      u64 timestamp_ns)
 {
 	struct kbase_device *kbdev = backend_jm->kctx->kbdev;
 	u64 clk_enable_map = enable_map->clk_enable_map;
 	u64 cycle_count;

-	if (kbase_hwcnt_clk_enable_map_enabled(
-		    clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) {
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) {
 		/* turn on the cycle counter */
 		kbase_pm_request_gpu_cycle_counter_l2_is_on(kbdev);
 		/* Read cycle count for top clock domain. */
-		kbase_backend_get_gpu_time_norequest(
-			kbdev, &cycle_count, NULL, NULL);
+		kbase_backend_get_gpu_time_norequest(kbdev, &cycle_count, NULL, NULL);

-		backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_TOP] =
-			cycle_count;
+		backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_TOP] = cycle_count;
 	}

-	if (kbase_hwcnt_clk_enable_map_enabled(
-		    clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
 		/* software estimation for non-top clock domains */
 		struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm;
-		const struct kbase_clk_data *clk_data =
-			rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES];
+		const struct kbase_clk_data *clk_data = rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES];
 		u32 cur_freq;
 		unsigned long flags;

 		spin_lock_irqsave(&rtm->lock, flags);

-		cur_freq = (u32) clk_data->clock_val;
+		cur_freq = (u32)clk_data->clock_val;
 		kbase_ccswe_reset(&backend_jm->ccswe_shader_cores);
-		kbase_ccswe_freq_change(
-			&backend_jm->ccswe_shader_cores,
-			timestamp_ns,
-			cur_freq);
+		kbase_ccswe_freq_change(&backend_jm->ccswe_shader_cores, timestamp_ns, cur_freq);

-		kbase_clk_rate_trace_manager_subscribe_no_lock(
-			rtm, &backend_jm->rate_listener);
+		kbase_clk_rate_trace_manager_subscribe_no_lock(rtm, &backend_jm->rate_listener);

 		spin_unlock_irqrestore(&rtm->lock, flags);

 		/* ccswe was reset. The estimated cycle is zero. */
-		backend_jm->prev_cycle_count[
-			KBASE_CLOCK_DOMAIN_SHADER_CORES] = 0;
+		backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_SHADER_CORES] = 0;
 	}

 	/* Keep clk_enable_map for dump_request. */
@ -319,28 +296,22 @@ static void kbasep_hwcnt_backend_jm_cc_enable(
 *
 * @backend_jm:      Non-NULL pointer to backend.
 */
-static void kbasep_hwcnt_backend_jm_cc_disable(
-	struct kbase_hwcnt_backend_jm *backend_jm)
+static void kbasep_hwcnt_backend_jm_cc_disable(struct kbase_hwcnt_backend_jm *backend_jm)
 {
 	struct kbase_device *kbdev = backend_jm->kctx->kbdev;
 	struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm;
 	u64 clk_enable_map = backend_jm->clk_enable_map;

-	if (kbase_hwcnt_clk_enable_map_enabled(
-		clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) {
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) {
 		/* turn off the cycle counter */
 		kbase_pm_release_gpu_cycle_counter(kbdev);
 	}

-	if (kbase_hwcnt_clk_enable_map_enabled(
-		clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
-
-		kbase_clk_rate_trace_manager_unsubscribe(
-			rtm, &backend_jm->rate_listener);
+	if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) {
+		kbase_clk_rate_trace_manager_unsubscribe(rtm, &backend_jm->rate_listener);
 	}
 }

-
 /**
 * kbasep_hwcnt_gpu_update_curr_config() - Update the destination buffer with
 *                                        current config information.
@ -356,38 +327,33 @@ static void kbasep_hwcnt_backend_jm_cc_disable(
 *
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_gpu_update_curr_config(
-	struct kbase_device *kbdev,
-	struct kbase_hwcnt_curr_config *curr_config)
+static int kbasep_hwcnt_gpu_update_curr_config(struct kbase_device *kbdev,
+					       struct kbase_hwcnt_curr_config *curr_config)
 {
 	if (WARN_ON(!kbdev) || WARN_ON(!curr_config))
 		return -EINVAL;

 	lockdep_assert_held(&kbdev->hwaccess_lock);

-	curr_config->num_l2_slices =
-		kbdev->gpu_props.curr_config.l2_slices;
-	curr_config->shader_present =
-		kbdev->gpu_props.curr_config.shader_present;
+	curr_config->num_l2_slices = kbdev->gpu_props.curr_config.l2_slices;
+	curr_config->shader_present = kbdev->gpu_props.curr_config.shader_present;
 	return 0;
 }

 /* JM backend implementation of kbase_hwcnt_backend_timestamp_ns_fn */
-static u64 kbasep_hwcnt_backend_jm_timestamp_ns(
-	struct kbase_hwcnt_backend *backend)
+static u64 kbasep_hwcnt_backend_jm_timestamp_ns(struct kbase_hwcnt_backend *backend)
 {
 	(void)backend;
 	return ktime_get_raw_ns();
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */
-static int kbasep_hwcnt_backend_jm_dump_enable_nolock(
-	struct kbase_hwcnt_backend *backend,
-	const struct kbase_hwcnt_enable_map *enable_map)
+static int
+kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
+					   const struct kbase_hwcnt_enable_map *enable_map)
 {
 	int errcode;
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
 	struct kbase_context *kctx;
 	struct kbase_device *kbdev;
 	struct kbase_hwcnt_physical_enable_map phys_enable_map;
@ -406,8 +372,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock(

 	kbase_hwcnt_gpu_enable_map_to_physical(&phys_enable_map, enable_map);

-	kbase_hwcnt_gpu_set_to_physical(&phys_counter_set,
-					backend_jm->info->counter_set);
+	kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, backend_jm->info->counter_set);

 	enable.fe_bm = phys_enable_map.fe_bm;
 	enable.shader_bm = phys_enable_map.shader_bm;
@ -425,8 +390,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock(
 	timestamp_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend);

 	/* Update the current configuration information. */
-	errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev,
-						      &backend_jm->curr_config);
+	errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, &backend_jm->curr_config);
 	if (errcode)
 		goto error;

@ -446,14 +410,12 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_enable_fn */
-static int kbasep_hwcnt_backend_jm_dump_enable(
-	struct kbase_hwcnt_backend *backend,
-	const struct kbase_hwcnt_enable_map *enable_map)
+static int kbasep_hwcnt_backend_jm_dump_enable(struct kbase_hwcnt_backend *backend,
+					       const struct kbase_hwcnt_enable_map *enable_map)
 {
 	unsigned long flags;
 	int errcode;
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
 	struct kbase_device *kbdev;

 	if (!backend_jm)
@ -463,8 +425,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable(

 	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);

-	errcode = kbasep_hwcnt_backend_jm_dump_enable_nolock(
-		backend, enable_map);
+	errcode = kbasep_hwcnt_backend_jm_dump_enable_nolock(backend, enable_map);

 	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

@ -472,12 +433,10 @@ static int kbasep_hwcnt_backend_jm_dump_enable(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_disable_fn */
-static void kbasep_hwcnt_backend_jm_dump_disable(
-	struct kbase_hwcnt_backend *backend)
+static void kbasep_hwcnt_backend_jm_dump_disable(struct kbase_hwcnt_backend *backend)
 {
 	int errcode;
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;

 	if (WARN_ON(!backend_jm) || !backend_jm->enabled)
 		return;
@ -491,11 +450,9 @@ static void kbasep_hwcnt_backend_jm_dump_disable(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_clear_fn */
-static int kbasep_hwcnt_backend_jm_dump_clear(
-	struct kbase_hwcnt_backend *backend)
+static int kbasep_hwcnt_backend_jm_dump_clear(struct kbase_hwcnt_backend *backend)
 {
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;

 	if (!backend_jm || !backend_jm->enabled)
 		return -EINVAL;
@ -504,12 +461,10 @@ static int kbasep_hwcnt_backend_jm_dump_clear(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_request_fn */
-static int kbasep_hwcnt_backend_jm_dump_request(
-	struct kbase_hwcnt_backend *backend,
-	u64 *dump_time_ns)
+static int kbasep_hwcnt_backend_jm_dump_request(struct kbase_hwcnt_backend *backend,
+						u64 *dump_time_ns)
 {
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
 	struct kbase_device *kbdev;
 	const struct kbase_hwcnt_metadata *metadata;
 	u64 current_cycle_count;
@ -528,28 +483,25 @@ static int kbasep_hwcnt_backend_jm_dump_request(
 		*dump_time_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend);
 		ret = kbase_instr_hwcnt_request_dump(backend_jm->kctx);

-		kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
-			if (!kbase_hwcnt_clk_enable_map_enabled(
-				backend_jm->clk_enable_map, clk))
+		kbase_hwcnt_metadata_for_each_clock(metadata, clk)
+		{
+			if (!kbase_hwcnt_clk_enable_map_enabled(backend_jm->clk_enable_map, clk))
 				continue;

 			if (clk == KBASE_CLOCK_DOMAIN_TOP) {
 				/* Read cycle count for top clock domain. */
-				kbase_backend_get_gpu_time_norequest(
-					kbdev, &current_cycle_count,
-					NULL, NULL);
+				kbase_backend_get_gpu_time_norequest(kbdev, &current_cycle_count,
+								     NULL, NULL);
 			} else {
 				/*
 				 * Estimate cycle count for non-top clock
 				 * domain.
 				 */
 				current_cycle_count = kbase_ccswe_cycle_at(
-					&backend_jm->ccswe_shader_cores,
-					*dump_time_ns);
+					&backend_jm->ccswe_shader_cores, *dump_time_ns);
 			}
 			backend_jm->cycle_count_elapsed[clk] =
-				current_cycle_count -
-				backend_jm->prev_cycle_count[clk];
+				current_cycle_count - backend_jm->prev_cycle_count[clk];

 			/*
 			 * Keep the current cycle count for later calculation.
@ -563,11 +515,9 @@ static int kbasep_hwcnt_backend_jm_dump_request(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_wait_fn */
-static int kbasep_hwcnt_backend_jm_dump_wait(
-	struct kbase_hwcnt_backend *backend)
+static int kbasep_hwcnt_backend_jm_dump_wait(struct kbase_hwcnt_backend *backend)
 {
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;

 	if (!backend_jm || !backend_jm->enabled)
 		return -EINVAL;
@ -576,14 +526,12 @@ static int kbasep_hwcnt_backend_jm_dump_wait(
 }

 /* JM backend implementation of kbase_hwcnt_backend_dump_get_fn */
-static int kbasep_hwcnt_backend_jm_dump_get(
-	struct kbase_hwcnt_backend *backend,
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_enable_map *dst_enable_map,
-	bool accumulate)
+static int kbasep_hwcnt_backend_jm_dump_get(struct kbase_hwcnt_backend *backend,
+					    struct kbase_hwcnt_dump_buffer *dst,
+					    const struct kbase_hwcnt_enable_map *dst_enable_map,
+					    bool accumulate)
 {
-	struct kbase_hwcnt_backend_jm *backend_jm =
-		(struct kbase_hwcnt_backend_jm *)backend;
+	struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
 	size_t clk;
 #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
 	struct kbase_device *kbdev;
@ -597,16 +545,15 @@ static int kbasep_hwcnt_backend_jm_dump_get(
 		return -EINVAL;

 	/* Invalidate the kernel buffer before reading from it. */
-	kbase_sync_mem_regions(
-		backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU);
+	kbase_sync_mem_regions(backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU);

 	/* Dump sample to the internal 64-bit user buffer. */
 	kbasep_hwcnt_backend_jm_dump_sample(backend_jm);

 	/* Extract elapsed cycle count for each clock domain if enabled. */
-	kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
-		if (!kbase_hwcnt_clk_enable_map_enabled(
-			dst_enable_map->clk_enable_map, clk))
+	kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk)
+	{
+		if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
 			continue;

 		/* Reset the counter to zero if accumulation is off. */
@ -621,17 +568,16 @@ static int kbasep_hwcnt_backend_jm_dump_get(
 	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);

 	/* Update the current configuration information. */
-	errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev,
-		&backend_jm->curr_config);
+	errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, &backend_jm->curr_config);

 	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

 	if (errcode)
 		return errcode;
 #endif /* CONFIG_MALI_BIFROST_NO_MALI */
-	return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf,
-				       dst_enable_map, backend_jm->pm_core_mask,
-				       &backend_jm->curr_config, accumulate);
+	return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map,
+				       backend_jm->pm_core_mask, &backend_jm->curr_config,
+				       accumulate);
 }

 /**
@ -643,10 +589,8 @@ static int kbasep_hwcnt_backend_jm_dump_get(
 *
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_backend_jm_dump_alloc(
-	const struct kbase_hwcnt_backend_jm_info *info,
-	struct kbase_context *kctx,
-	u64 *gpu_dump_va)
+static int kbasep_hwcnt_backend_jm_dump_alloc(const struct kbase_hwcnt_backend_jm_info *info,
+					      struct kbase_context *kctx, u64 *gpu_dump_va)
 {
 	struct kbase_va_region *reg;
 	u64 flags;
@ -661,16 +605,12 @@ static int kbasep_hwcnt_backend_jm_dump_alloc(
 	WARN_ON(!kctx);
 	WARN_ON(!gpu_dump_va);

-	flags = BASE_MEM_PROT_CPU_RD |
-		BASE_MEM_PROT_GPU_WR |
-		BASEP_MEM_PERMANENT_KERNEL_MAPPING |
-		BASE_MEM_CACHED_CPU |
-		BASE_MEM_UNCACHED_GPU;
+	flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_GPU_WR | BASEP_MEM_PERMANENT_KERNEL_MAPPING |
+		BASE_MEM_CACHED_CPU | BASE_MEM_UNCACHED_GPU;

 	nr_pages = PFN_UP(info->dump_bytes);

-	reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va,
-			      mmu_sync_info);
+	reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va, mmu_sync_info);

 	if (!reg)
 		return -ENOMEM;
@ -683,9 +623,7 @@ static int kbasep_hwcnt_backend_jm_dump_alloc(
 * @kctx:        Non-NULL pointer to kbase context.
 * @gpu_dump_va: GPU dump buffer virtual address.
 */
-static void kbasep_hwcnt_backend_jm_dump_free(
-	struct kbase_context *kctx,
-	u64 gpu_dump_va)
+static void kbasep_hwcnt_backend_jm_dump_free(struct kbase_context *kctx, u64 gpu_dump_va)
 {
 	WARN_ON(!kctx);
 	if (gpu_dump_va)
@ -698,8 +636,7 @@ static void kbasep_hwcnt_backend_jm_dump_free(
 *
 * Can be safely called on a backend in any state of partial construction.
 */
-static void kbasep_hwcnt_backend_jm_destroy(
-	struct kbase_hwcnt_backend_jm *backend)
+static void kbasep_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_jm *backend)
 {
 	if (!backend)
 		return;
@ -712,8 +649,7 @@ static void kbasep_hwcnt_backend_jm_destroy(
 			kbase_phy_alloc_mapping_put(kctx, backend->vmap);

 		if (backend->gpu_dump_va)
-			kbasep_hwcnt_backend_jm_dump_free(
-				kctx, backend->gpu_dump_va);
+			kbasep_hwcnt_backend_jm_dump_free(kctx, backend->gpu_dump_va);

 		kbasep_js_release_privileged_ctx(kbdev, kctx);
 		kbase_destroy_context(kctx);
@ -731,9 +667,8 @@ static void kbasep_hwcnt_backend_jm_destroy(
 *
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_backend_jm_create(
-	const struct kbase_hwcnt_backend_jm_info *info,
-	struct kbase_hwcnt_backend_jm **out_backend)
+static int kbasep_hwcnt_backend_jm_create(const struct kbase_hwcnt_backend_jm_info *info,
+					  struct kbase_hwcnt_backend_jm **out_backend)
 {
 	int errcode;
 	struct kbase_device *kbdev;
@ -749,28 +684,25 @@ static int kbasep_hwcnt_backend_jm_create(
 		goto alloc_error;

 	backend->info = info;
-	kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info,
-					    &backend->phys_layout);
+	kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info, &backend->phys_layout);

 	backend->kctx = kbase_create_context(kbdev, true,
-		BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL);
+					     BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL);
 	if (!backend->kctx)
 		goto alloc_error;

 	kbasep_js_schedule_privileged_ctx(kbdev, backend->kctx);

-	errcode = kbasep_hwcnt_backend_jm_dump_alloc(
-		info, backend->kctx, &backend->gpu_dump_va);
+	errcode = kbasep_hwcnt_backend_jm_dump_alloc(info, backend->kctx, &backend->gpu_dump_va);
 	if (errcode)
 		goto error;

-	backend->cpu_dump_va = kbase_phy_alloc_mapping_get(backend->kctx,
-		backend->gpu_dump_va, &backend->vmap);
+	backend->cpu_dump_va =
+		kbase_phy_alloc_mapping_get(backend->kctx, backend->gpu_dump_va, &backend->vmap);
 	if (!backend->cpu_dump_va || !backend->vmap)
 		goto alloc_error;

-	backend->to_user_buf =
-		kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL);
+	backend->to_user_buf = kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL);
 	if (!backend->to_user_buf)
 		goto alloc_error;

@ -798,9 +730,8 @@ kbasep_hwcnt_backend_jm_metadata(const struct kbase_hwcnt_backend_info *info)
 }

 /* JM backend implementation of kbase_hwcnt_backend_init_fn */
-static int kbasep_hwcnt_backend_jm_init(
-	const struct kbase_hwcnt_backend_info *info,
-	struct kbase_hwcnt_backend **out_backend)
+static int kbasep_hwcnt_backend_jm_init(const struct kbase_hwcnt_backend_info *info,
+					struct kbase_hwcnt_backend **out_backend)
 {
 	int errcode;
 	struct kbase_hwcnt_backend_jm *backend = NULL;
@ -808,8 +739,8 @@ static int kbasep_hwcnt_backend_jm_init(
 	if (!info || !out_backend)
 		return -EINVAL;

-	errcode = kbasep_hwcnt_backend_jm_create(
-		(const struct kbase_hwcnt_backend_jm_info *) info, &backend);
+	errcode = kbasep_hwcnt_backend_jm_create((const struct kbase_hwcnt_backend_jm_info *)info,
+						 &backend);
 	if (errcode)
 		return errcode;

@ -825,8 +756,7 @@ static void kbasep_hwcnt_backend_jm_term(struct kbase_hwcnt_backend *backend)
 		return;

 	kbasep_hwcnt_backend_jm_dump_disable(backend);
-	kbasep_hwcnt_backend_jm_destroy(
-		(struct kbase_hwcnt_backend_jm *)backend);
+	kbasep_hwcnt_backend_jm_destroy((struct kbase_hwcnt_backend_jm *)backend);
 }

 /**
@ -835,8 +765,7 @@ static void kbasep_hwcnt_backend_jm_term(struct kbase_hwcnt_backend *backend)
 *
 * Can be safely called on a backend info in any state of partial construction.
 */
-static void kbasep_hwcnt_backend_jm_info_destroy(
-	const struct kbase_hwcnt_backend_jm_info *info)
+static void kbasep_hwcnt_backend_jm_info_destroy(const struct kbase_hwcnt_backend_jm_info *info)
 {
 	if (!info)
 		return;
@ -852,9 +781,8 @@ static void kbasep_hwcnt_backend_jm_info_destroy(
 *
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_backend_jm_info_create(
-	struct kbase_device *kbdev,
-	const struct kbase_hwcnt_backend_jm_info **out_info)
+static int kbasep_hwcnt_backend_jm_info_create(struct kbase_device *kbdev,
+					       const struct kbase_hwcnt_backend_jm_info **out_info)
 {
 	int errcode = -ENOMEM;
 	struct kbase_hwcnt_backend_jm_info *info = NULL;
@ -877,15 +805,12 @@ static int kbasep_hwcnt_backend_jm_info_create(
 	info->counter_set = KBASE_HWCNT_SET_PRIMARY;
 #endif

-	errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev,
-							&info->hwcnt_gpu_info);
+	errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, &info->hwcnt_gpu_info);
 	if (errcode)
 		goto error;

-	errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info,
-						 info->counter_set,
-						 &info->metadata,
-						 &info->dump_bytes);
+	errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info, info->counter_set,
+						 &info->metadata, &info->dump_bytes);
 	if (errcode)
 		goto error;

@ -897,9 +822,8 @@ static int kbasep_hwcnt_backend_jm_info_create(
 	return errcode;
 }

-int kbase_hwcnt_backend_jm_create(
-	struct kbase_device *kbdev,
-	struct kbase_hwcnt_backend_interface *iface)
+int kbase_hwcnt_backend_jm_create(struct kbase_device *kbdev,
+				  struct kbase_hwcnt_backend_interface *iface)
 {
 	int errcode;
 	const struct kbase_hwcnt_backend_jm_info *info = NULL;
@ -928,8 +852,7 @@ int kbase_hwcnt_backend_jm_create(
 	return 0;
 }

-void kbase_hwcnt_backend_jm_destroy(
-	struct kbase_hwcnt_backend_interface *iface)
+void kbase_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_interface *iface)
 {
 	if (!iface)
 		return;
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -27,7 +27,7 @@
 #ifndef _KBASE_HWCNT_BACKEND_JM_H_
 #define _KBASE_HWCNT_BACKEND_JM_H_

-#include "mali_kbase_hwcnt_backend.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend.h"

 struct kbase_device;

@ -42,9 +42,8 @@ struct kbase_device;
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_backend_jm_create(
-	struct kbase_device *kbdev,
-	struct kbase_hwcnt_backend_interface *iface);
+int kbase_hwcnt_backend_jm_create(struct kbase_device *kbdev,
+				  struct kbase_hwcnt_backend_interface *iface);

 /**
 * kbase_hwcnt_backend_jm_destroy() - Destroy a JM hardware counter backend
@ -54,7 +53,6 @@ int kbase_hwcnt_backend_jm_create(
 * Can be safely called on an all-zeroed interface, or on an already destroyed
 * interface.
 */
-void kbase_hwcnt_backend_jm_destroy(
-	struct kbase_hwcnt_backend_interface *iface);
+void kbase_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_interface *iface);

 #endif /* _KBASE_HWCNT_BACKEND_JM_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c
@ -21,11 +21,12 @@

 #include <mali_kbase.h>

-#include <mali_kbase_hwcnt_gpu.h>
-#include <mali_kbase_hwcnt_types.h>
+#include <hwcnt/mali_kbase_hwcnt_gpu.h>
+#include <hwcnt/mali_kbase_hwcnt_types.h>

-#include <mali_kbase_hwcnt_backend.h>
-#include <mali_kbase_hwcnt_watchdog_if.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h>
+#include <hwcnt/mali_kbase_hwcnt_watchdog_if.h>

 #if IS_ENABLED(CONFIG_MALI_IS_FPGA) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
 /* Backend watch dog timer interval in milliseconds: 18 seconds. */
@ -118,8 +119,7 @@ enum backend_watchdog_state {
 */
 enum wd_init_state {
 	HWCNT_JM_WD_INIT_START,
-	HWCNT_JM_WD_INIT_ALLOC = HWCNT_JM_WD_INIT_START,
-	HWCNT_JM_WD_INIT_BACKEND,
+	HWCNT_JM_WD_INIT_BACKEND = HWCNT_JM_WD_INIT_START,
 	HWCNT_JM_WD_INIT_ENABLE_MAP,
 	HWCNT_JM_WD_INIT_DUMP_BUFFER,
 	HWCNT_JM_WD_INIT_END
@ -296,16 +296,10 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc
 	if (!wd_backend)
 		return;

-	/* disable timer thread to avoid concurrent access to shared resources */
-	wd_backend->info->dump_watchdog_iface->disable(
-		wd_backend->info->dump_watchdog_iface->timer);
+	WARN_ON(state > HWCNT_JM_WD_INIT_END);

-	/*will exit the loop when state reaches HWCNT_JM_WD_INIT_START*/
 	while (state-- > HWCNT_JM_WD_INIT_START) {
 		switch (state) {
-		case HWCNT_JM_WD_INIT_ALLOC:
-			kfree(wd_backend);
-			break;
 		case HWCNT_JM_WD_INIT_BACKEND:
 			wd_backend->info->jm_backend_iface->term(wd_backend->jm_backend);
 			break;
@ -319,6 +313,8 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc
 			break;
 		}
 	}
+
+	kfree(wd_backend);
 }

 /* Job manager watchdog backend, implementation of kbase_hwcnt_backend_term_fn
@ -326,11 +322,17 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc
 */
 static void kbasep_hwcnt_backend_jm_watchdog_term(struct kbase_hwcnt_backend *backend)
 {
+	struct kbase_hwcnt_backend_jm_watchdog *wd_backend =
+		(struct kbase_hwcnt_backend_jm_watchdog *)backend;
+
 	if (!backend)
 		return;

-	kbasep_hwcnt_backend_jm_watchdog_term_partial(
-		(struct kbase_hwcnt_backend_jm_watchdog *)backend, HWCNT_JM_WD_INIT_END);
+	/* disable timer thread to avoid concurrent access to shared resources */
+	wd_backend->info->dump_watchdog_iface->disable(
+		wd_backend->info->dump_watchdog_iface->timer);
+
+	kbasep_hwcnt_backend_jm_watchdog_term_partial(wd_backend, HWCNT_JM_WD_INIT_END);
 }

 /* Job manager watchdog backend, implementation of kbase_hwcnt_backend_init_fn */
@ -350,20 +352,20 @@ static int kbasep_hwcnt_backend_jm_watchdog_init(const struct kbase_hwcnt_backen
 	jm_info = wd_info->jm_backend_iface->info;
 	metadata = wd_info->jm_backend_iface->metadata(wd_info->jm_backend_iface->info);

+	wd_backend = kmalloc(sizeof(*wd_backend), GFP_KERNEL);
+	if (!wd_backend) {
+		*out_backend = NULL;
+		return -ENOMEM;
+	}
+
+	*wd_backend = (struct kbase_hwcnt_backend_jm_watchdog){
+		.info = wd_info,
+		.timeout_ms = hwcnt_backend_watchdog_timer_interval_ms,
+		.locked = { .state = HWCNT_JM_WD_IDLE_BUFFER_EMPTY, .is_enabled = false }
+	};
+
 	while (state < HWCNT_JM_WD_INIT_END && !errcode) {
 		switch (state) {
-		case HWCNT_JM_WD_INIT_ALLOC:
-			wd_backend = kmalloc(sizeof(*wd_backend), GFP_KERNEL);
-			if (wd_backend) {
-				*wd_backend = (struct kbase_hwcnt_backend_jm_watchdog){
-					.info = wd_info,
-					.timeout_ms = hwcnt_backend_watchdog_timer_interval_ms,
-					.locked = { .state = HWCNT_JM_WD_IDLE_BUFFER_EMPTY,
-						    .is_enabled = false }
-				};
-			} else
-				errcode = -ENOMEM;
-			break;
 		case HWCNT_JM_WD_INIT_BACKEND:
 			errcode = wd_info->jm_backend_iface->init(jm_info, &wd_backend->jm_backend);
 			break;
@ -823,5 +825,5 @@ void kbase_hwcnt_backend_jm_watchdog_destroy(struct kbase_hwcnt_backend_interfac
 	kfree((struct kbase_hwcnt_backend_jm_watchdog_info *)iface->info);

 	/*blanking the watchdog backend interface*/
-	*iface = (struct kbase_hwcnt_backend_interface){ NULL };
+	memset(iface, 0, sizeof(*iface));
 }
--- a/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -32,8 +32,8 @@
 #ifndef _KBASE_HWCNT_BACKEND_JM_WATCHDOG_H_
 #define _KBASE_HWCNT_BACKEND_JM_WATCHDOG_H_

-#include <mali_kbase_hwcnt_backend.h>
-#include <mali_kbase_hwcnt_watchdog_if.h>
+#include <hwcnt/backend/mali_kbase_hwcnt_backend.h>
+#include <hwcnt/mali_kbase_hwcnt_watchdog_if.h>

 /**
 * kbase_hwcnt_backend_jm_watchdog_create() - Create a job manager hardware counter watchdog
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt.c
@ -23,10 +23,10 @@
 * Implementation of hardware counter context and accumulator APIs.
 */

-#include "mali_kbase_hwcnt_context.h"
-#include "mali_kbase_hwcnt_accumulator.h"
-#include "mali_kbase_hwcnt_backend.h"
-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/mali_kbase_hwcnt_context.h"
+#include "hwcnt/mali_kbase_hwcnt_accumulator.h"
+#include "hwcnt/backend/mali_kbase_hwcnt_backend.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"

 #include <linux/mutex.h>
 #include <linux/spinlock.h>
@ -39,11 +39,7 @@
 * @ACCUM_STATE_ENABLED:  Enabled state, where dumping is enabled if there are
 *                        any enabled counters.
 */
-enum kbase_hwcnt_accum_state {
-	ACCUM_STATE_ERROR,
-	ACCUM_STATE_DISABLED,
-	ACCUM_STATE_ENABLED
-};
+enum kbase_hwcnt_accum_state { ACCUM_STATE_ERROR, ACCUM_STATE_DISABLED, ACCUM_STATE_ENABLED };

 /**
 * struct kbase_hwcnt_accumulator - Hardware counter accumulator structure.
@ -130,9 +126,8 @@ struct kbase_hwcnt_context {
 	struct workqueue_struct *wq;
 };

-int kbase_hwcnt_context_init(
-	const struct kbase_hwcnt_backend_interface *iface,
-	struct kbase_hwcnt_context **out_hctx)
+int kbase_hwcnt_context_init(const struct kbase_hwcnt_backend_interface *iface,
+			     struct kbase_hwcnt_context **out_hctx)
 {
 	struct kbase_hwcnt_context *hctx = NULL;

@ -149,8 +144,7 @@ int kbase_hwcnt_context_init(
 	mutex_init(&hctx->accum_lock);
 	hctx->accum_inited = false;

-	hctx->wq =
-		alloc_workqueue("mali_kbase_hwcnt", WQ_HIGHPRI | WQ_UNBOUND, 0);
+	hctx->wq = alloc_workqueue("mali_kbase_hwcnt", WQ_HIGHPRI | WQ_UNBOUND, 0);
 	if (!hctx->wq)
 		goto err_alloc_workqueue;

@ -208,35 +202,30 @@ static int kbasep_hwcnt_accumulator_init(struct kbase_hwcnt_context *hctx)
 	WARN_ON(!hctx);
 	WARN_ON(!hctx->accum_inited);

-	errcode = hctx->iface->init(
-		hctx->iface->info, &hctx->accum.backend);
+	errcode = hctx->iface->init(hctx->iface->info, &hctx->accum.backend);
 	if (errcode)
 		goto error;

 	hctx->accum.metadata = hctx->iface->metadata(hctx->iface->info);
 	hctx->accum.state = ACCUM_STATE_ERROR;

-	errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata,
-					       &hctx->accum.enable_map);
+	errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, &hctx->accum.enable_map);
 	if (errcode)
 		goto error;

 	hctx->accum.enable_map_any_enabled = false;

-	errcode = kbase_hwcnt_dump_buffer_alloc(hctx->accum.metadata,
-						&hctx->accum.accum_buf);
+	errcode = kbase_hwcnt_dump_buffer_alloc(hctx->accum.metadata, &hctx->accum.accum_buf);
 	if (errcode)
 		goto error;

-	errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata,
-					       &hctx->accum.scratch_map);
+	errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, &hctx->accum.scratch_map);
 	if (errcode)
 		goto error;

 	hctx->accum.accumulated = false;

-	hctx->accum.ts_last_dump_ns =
-		hctx->iface->timestamp_ns(hctx->accum.backend);
+	hctx->accum.ts_last_dump_ns = hctx->iface->timestamp_ns(hctx->accum.backend);

 	return 0;

@ -252,8 +241,7 @@ static int kbasep_hwcnt_accumulator_init(struct kbase_hwcnt_context *hctx)
 * @hctx:       Non-NULL pointer to hardware counter context.
 * @accumulate: True if we should accumulate before disabling, else false.
 */
-static void kbasep_hwcnt_accumulator_disable(
-	struct kbase_hwcnt_context *hctx, bool accumulate)
+static void kbasep_hwcnt_accumulator_disable(struct kbase_hwcnt_context *hctx, bool accumulate)
 {
 	int errcode = 0;
 	bool backend_enabled = false;
@ -272,8 +260,7 @@ static void kbasep_hwcnt_accumulator_disable(
 	WARN_ON(hctx->disable_count != 0);
 	WARN_ON(hctx->accum.state == ACCUM_STATE_DISABLED);

-	if ((hctx->accum.state == ACCUM_STATE_ENABLED) &&
-	    (accum->enable_map_any_enabled))
+	if ((hctx->accum.state == ACCUM_STATE_ENABLED) && (accum->enable_map_any_enabled))
 		backend_enabled = true;

 	if (!backend_enabled)
@ -297,8 +284,8 @@ static void kbasep_hwcnt_accumulator_disable(
 	if (errcode)
 		goto disable;

-	errcode = hctx->iface->dump_get(accum->backend,
-		&accum->accum_buf, &accum->enable_map, accum->accumulated);
+	errcode = hctx->iface->dump_get(accum->backend, &accum->accum_buf, &accum->enable_map,
+					accum->accumulated);
 	if (errcode)
 		goto disable;

@ -336,8 +323,7 @@ static void kbasep_hwcnt_accumulator_enable(struct kbase_hwcnt_context *hctx)

 	/* The backend only needs enabling if any counters are enabled */
 	if (accum->enable_map_any_enabled)
-		errcode = hctx->iface->dump_enable_nolock(
-			accum->backend, &accum->enable_map);
+		errcode = hctx->iface->dump_enable_nolock(accum->backend, &accum->enable_map);

 	if (!errcode)
 		accum->state = ACCUM_STATE_ENABLED;
@ -364,12 +350,9 @@ static void kbasep_hwcnt_accumulator_enable(struct kbase_hwcnt_context *hctx)
 *
 * Return:       0 on success, else error code.
 */
-static int kbasep_hwcnt_accumulator_dump(
-	struct kbase_hwcnt_context *hctx,
-	u64 *ts_start_ns,
-	u64 *ts_end_ns,
-	struct kbase_hwcnt_dump_buffer *dump_buf,
-	const struct kbase_hwcnt_enable_map *new_map)
+static int kbasep_hwcnt_accumulator_dump(struct kbase_hwcnt_context *hctx, u64 *ts_start_ns,
+					 u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf,
+					 const struct kbase_hwcnt_enable_map *new_map)
 {
 	int errcode = 0;
 	unsigned long flags;
@ -398,8 +381,7 @@ static int kbasep_hwcnt_accumulator_dump(
 	kbase_hwcnt_enable_map_copy(cur_map, &accum->enable_map);

 	if (new_map)
-		new_map_any_enabled =
-			kbase_hwcnt_enable_map_any_enabled(new_map);
+		new_map_any_enabled = kbase_hwcnt_enable_map_any_enabled(new_map);

 	/*
 	 * We're holding accum_lock, so the accumulator state might transition
@ -426,8 +408,7 @@ static int kbasep_hwcnt_accumulator_dump(
 	 * then we'll do it ourselves after the dump.
 	 */
 	if (new_map) {
-		kbase_hwcnt_enable_map_copy(
-			&accum->enable_map, new_map);
+		kbase_hwcnt_enable_map_copy(&accum->enable_map, new_map);
 		accum->enable_map_any_enabled = new_map_any_enabled;
 	}

@ -440,12 +421,10 @@ static int kbasep_hwcnt_accumulator_dump(
 	/* Initiate the dump if the backend is enabled. */
 	if ((state == ACCUM_STATE_ENABLED) && cur_map_any_enabled) {
 		if (dump_buf) {
-			errcode = hctx->iface->dump_request(
-					accum->backend, &dump_time_ns);
+			errcode = hctx->iface->dump_request(accum->backend, &dump_time_ns);
 			dump_requested = true;
 		} else {
-			dump_time_ns = hctx->iface->timestamp_ns(
-					accum->backend);
+			dump_time_ns = hctx->iface->timestamp_ns(accum->backend);
 			errcode = hctx->iface->dump_clear(accum->backend);
 		}

@ -457,8 +436,7 @@ static int kbasep_hwcnt_accumulator_dump(

 	/* Copy any accumulation into the dest buffer */
 	if (accum->accumulated && dump_buf) {
-		kbase_hwcnt_dump_buffer_copy(
-			dump_buf, &accum->accum_buf, cur_map);
+		kbase_hwcnt_dump_buffer_copy(dump_buf, &accum->accum_buf, cur_map);
 		dump_written = true;
 	}

@ -483,8 +461,7 @@ static int kbasep_hwcnt_accumulator_dump(
 		 * we're already enabled and holding accum_lock is impossible.
 		 */
 		if (new_map_any_enabled) {
-			errcode = hctx->iface->dump_enable(
-				accum->backend, new_map);
+			errcode = hctx->iface->dump_enable(accum->backend, new_map);
 			if (errcode)
 				goto error;
 		}
@ -495,11 +472,8 @@ static int kbasep_hwcnt_accumulator_dump(
 		/* If we dumped, copy or accumulate it into the destination */
 		if (dump_requested) {
 			WARN_ON(state != ACCUM_STATE_ENABLED);
-			errcode = hctx->iface->dump_get(
-				accum->backend,
-				dump_buf,
-				cur_map,
-				dump_written);
+			errcode = hctx->iface->dump_get(accum->backend, dump_buf, cur_map,
+							dump_written);
 			if (errcode)
 				goto error;
 			dump_written = true;
@ -540,8 +514,7 @@ static int kbasep_hwcnt_accumulator_dump(
 * @hctx:       Non-NULL pointer to hardware counter context.
 * @accumulate: True if we should accumulate before disabling, else false.
 */
-static void kbasep_hwcnt_context_disable(
-	struct kbase_hwcnt_context *hctx, bool accumulate)
+static void kbasep_hwcnt_context_disable(struct kbase_hwcnt_context *hctx, bool accumulate)
 {
 	unsigned long flags;

@ -563,9 +536,8 @@ static void kbasep_hwcnt_context_disable(
 	}
 }

-int kbase_hwcnt_accumulator_acquire(
-	struct kbase_hwcnt_context *hctx,
-	struct kbase_hwcnt_accumulator **accum)
+int kbase_hwcnt_accumulator_acquire(struct kbase_hwcnt_context *hctx,
+				    struct kbase_hwcnt_accumulator **accum)
 {
 	int errcode = 0;
 	unsigned long flags;
@ -618,9 +590,7 @@ int kbase_hwcnt_accumulator_acquire(
 	 * Regardless of initial state, counters don't need to be enabled via
 	 * the backend, as the initial enable map has no enabled counters.
 	 */
-	hctx->accum.state = (hctx->disable_count == 0) ?
-		ACCUM_STATE_ENABLED :
-		ACCUM_STATE_DISABLED;
+	hctx->accum.state = (hctx->disable_count == 0) ? ACCUM_STATE_ENABLED : ACCUM_STATE_DISABLED;

 	spin_unlock_irqrestore(&hctx->state_lock, flags);

@ -728,8 +698,7 @@ void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx)
 	spin_unlock_irqrestore(&hctx->state_lock, flags);
 }

-const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(
-	struct kbase_hwcnt_context *hctx)
+const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(struct kbase_hwcnt_context *hctx)
 {
 	if (!hctx)
 		return NULL;
@ -737,8 +706,7 @@ const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(
 	return hctx->iface->metadata(hctx->iface->info);
 }

-bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx,
-				    struct work_struct *work)
+bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, struct work_struct *work)
 {
 	if (WARN_ON(!hctx) || WARN_ON(!work))
 		return false;
@ -746,12 +714,10 @@ bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx,
 	return queue_work(hctx->wq, work);
 }

-int kbase_hwcnt_accumulator_set_counters(
-	struct kbase_hwcnt_accumulator *accum,
-	const struct kbase_hwcnt_enable_map *new_map,
-	u64 *ts_start_ns,
-	u64 *ts_end_ns,
-	struct kbase_hwcnt_dump_buffer *dump_buf)
+int kbase_hwcnt_accumulator_set_counters(struct kbase_hwcnt_accumulator *accum,
+					 const struct kbase_hwcnt_enable_map *new_map,
+					 u64 *ts_start_ns, u64 *ts_end_ns,
+					 struct kbase_hwcnt_dump_buffer *dump_buf)
 {
 	int errcode;
 	struct kbase_hwcnt_context *hctx;
@ -767,19 +733,15 @@ int kbase_hwcnt_accumulator_set_counters(

 	mutex_lock(&hctx->accum_lock);

-	errcode = kbasep_hwcnt_accumulator_dump(
-		hctx, ts_start_ns, ts_end_ns, dump_buf, new_map);
+	errcode = kbasep_hwcnt_accumulator_dump(hctx, ts_start_ns, ts_end_ns, dump_buf, new_map);

 	mutex_unlock(&hctx->accum_lock);

 	return errcode;
 }

-int kbase_hwcnt_accumulator_dump(
-	struct kbase_hwcnt_accumulator *accum,
-	u64 *ts_start_ns,
-	u64 *ts_end_ns,
-	struct kbase_hwcnt_dump_buffer *dump_buf)
+int kbase_hwcnt_accumulator_dump(struct kbase_hwcnt_accumulator *accum, u64 *ts_start_ns,
+				 u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf)
 {
 	int errcode;
 	struct kbase_hwcnt_context *hctx;
@ -794,8 +756,7 @@ int kbase_hwcnt_accumulator_dump(

 	mutex_lock(&hctx->accum_lock);

-	errcode = kbasep_hwcnt_accumulator_dump(
-		hctx, ts_start_ns, ts_end_ns, dump_buf, NULL);
+	errcode = kbasep_hwcnt_accumulator_dump(hctx, ts_start_ns, ts_end_ns, dump_buf, NULL);

 	mutex_unlock(&hctx->accum_lock);

--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_accumulator.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_accumulator.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -67,9 +67,8 @@ struct kbase_hwcnt_dump_buffer;
 *
 * Return: 0 on success or error code.
 */
-int kbase_hwcnt_accumulator_acquire(
-	struct kbase_hwcnt_context *hctx,
-	struct kbase_hwcnt_accumulator **accum);
+int kbase_hwcnt_accumulator_acquire(struct kbase_hwcnt_context *hctx,
+				    struct kbase_hwcnt_accumulator **accum);

 /**
 * kbase_hwcnt_accumulator_release() - Release a hardware counter accumulator.
@ -102,12 +101,10 @@ void kbase_hwcnt_accumulator_release(struct kbase_hwcnt_accumulator *accum);
 *
 * Return: 0 on success or error code.
 */
-int kbase_hwcnt_accumulator_set_counters(
-	struct kbase_hwcnt_accumulator *accum,
-	const struct kbase_hwcnt_enable_map *new_map,
-	u64 *ts_start_ns,
-	u64 *ts_end_ns,
-	struct kbase_hwcnt_dump_buffer *dump_buf);
+int kbase_hwcnt_accumulator_set_counters(struct kbase_hwcnt_accumulator *accum,
+					 const struct kbase_hwcnt_enable_map *new_map,
+					 u64 *ts_start_ns, u64 *ts_end_ns,
+					 struct kbase_hwcnt_dump_buffer *dump_buf);

 /**
 * kbase_hwcnt_accumulator_dump() - Perform a dump of the currently enabled
@ -127,11 +124,8 @@ int kbase_hwcnt_accumulator_set_counters(
 *
 * Return: 0 on success or error code.
 */
-int kbase_hwcnt_accumulator_dump(
-	struct kbase_hwcnt_accumulator *accum,
-	u64 *ts_start_ns,
-	u64 *ts_end_ns,
-	struct kbase_hwcnt_dump_buffer *dump_buf);
+int kbase_hwcnt_accumulator_dump(struct kbase_hwcnt_accumulator *accum, u64 *ts_start_ns,
+				 u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf);

 /**
 * kbase_hwcnt_accumulator_timestamp_ns() - Get the current accumulator backend
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_context.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_context.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -43,9 +43,8 @@ struct kbase_hwcnt_context;
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_context_init(
-	const struct kbase_hwcnt_backend_interface *iface,
-	struct kbase_hwcnt_context **out_hctx);
+int kbase_hwcnt_context_init(const struct kbase_hwcnt_backend_interface *iface,
+			     struct kbase_hwcnt_context **out_hctx);

 /**
 * kbase_hwcnt_context_term() - Terminate a hardware counter context.
@ -61,8 +60,7 @@ void kbase_hwcnt_context_term(struct kbase_hwcnt_context *hctx);
 *
 * Return: Non-NULL pointer to metadata, or NULL on error.
 */
-const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(
-	struct kbase_hwcnt_context *hctx);
+const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(struct kbase_hwcnt_context *hctx);

 /**
 * kbase_hwcnt_context_disable() - Increment the disable count of the context.
@ -145,7 +143,6 @@ void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx);
 * this meant progress through the power management states could be stalled
 * for however long that higher priority thread took.
 */
-bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx,
-				    struct work_struct *work);
+bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, struct work_struct *work);

 #endif /* _KBASE_HWCNT_CONTEXT_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.c
@ -19,8 +19,8 @@
 *
 */

-#include "mali_kbase_hwcnt_gpu.h"
-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/mali_kbase_hwcnt_gpu.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"

 #include <linux/err.h>

@ -32,8 +32,7 @@ enum enable_map_idx {
 	EM_COUNT,
 };

-static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set,
-				     bool is_csf)
+static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf)
 {
 	switch (counter_set) {
 	case KBASE_HWCNT_SET_PRIMARY:
@ -56,8 +55,7 @@ static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set,
 	}
 }

-static void kbasep_get_tiler_block_type(u64 *dst,
-					enum kbase_hwcnt_set counter_set)
+static void kbasep_get_tiler_block_type(u64 *dst, enum kbase_hwcnt_set counter_set)
 {
 	switch (counter_set) {
 	case KBASE_HWCNT_SET_PRIMARY:
@ -72,8 +70,7 @@ static void kbasep_get_tiler_block_type(u64 *dst,
 	}
 }

-static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set,
-				     bool is_csf)
+static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf)
 {
 	switch (counter_set) {
 	case KBASE_HWCNT_SET_PRIMARY:
@ -93,8 +90,7 @@ static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set,
 	}
 }

-static void kbasep_get_memsys_block_type(u64 *dst,
-					 enum kbase_hwcnt_set counter_set)
+static void kbasep_get_memsys_block_type(u64 *dst, enum kbase_hwcnt_set counter_set)
 {
 	switch (counter_set) {
 	case KBASE_HWCNT_SET_PRIMARY:
@ -122,15 +118,14 @@ static void kbasep_get_memsys_block_type(u64 *dst,
 *
 * Return: 0 on success, else error code.
 */
-static int kbasep_hwcnt_backend_gpu_metadata_create(
-	const struct kbase_hwcnt_gpu_info *gpu_info, const bool is_csf,
-	enum kbase_hwcnt_set counter_set,
-	const struct kbase_hwcnt_metadata **metadata)
+static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info,
+						    const bool is_csf,
+						    enum kbase_hwcnt_set counter_set,
+						    const struct kbase_hwcnt_metadata **metadata)
 {
 	struct kbase_hwcnt_description desc;
 	struct kbase_hwcnt_group_description group;
-	struct kbase_hwcnt_block_description
-		blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
+	struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
 	size_t non_sc_block_count;
 	size_t sc_block_count;

@ -156,22 +151,19 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(
 	kbasep_get_fe_block_type(&blks[0].type, counter_set, is_csf);
 	blks[0].inst_cnt = 1;
 	blks[0].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
-	blks[0].ctr_cnt = gpu_info->prfcnt_values_per_block -
-			  KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+	blks[0].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;

 	/* One Tiler block */
 	kbasep_get_tiler_block_type(&blks[1].type, counter_set);
 	blks[1].inst_cnt = 1;
 	blks[1].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
-	blks[1].ctr_cnt = gpu_info->prfcnt_values_per_block -
-			  KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+	blks[1].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;

 	/* l2_count memsys blks */
 	kbasep_get_memsys_block_type(&blks[2].type, counter_set);
 	blks[2].inst_cnt = gpu_info->l2_count;
 	blks[2].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
-	blks[2].ctr_cnt = gpu_info->prfcnt_values_per_block -
-			  KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+	blks[2].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;

 	/*
 	 * There are as many shader cores in the system as there are bits set in
@ -192,8 +184,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(
 	kbasep_get_sc_block_type(&blks[3].type, counter_set, is_csf);
 	blks[3].inst_cnt = sc_block_count;
 	blks[3].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
-	blks[3].ctr_cnt = gpu_info->prfcnt_values_per_block -
-			  KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+	blks[3].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;

 	WARN_ON(KBASE_HWCNT_V5_BLOCK_TYPE_COUNT != 4);

@ -220,8 +211,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(
 *
 * Return: Size of buffer the GPU needs to perform a counter dump.
 */
-static size_t
-kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info)
+static size_t kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info)
 {
 	WARN_ON(!gpu_info);

@ -229,11 +219,10 @@ kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info)
 	       gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES;
 }

-int kbase_hwcnt_jm_metadata_create(
-	const struct kbase_hwcnt_gpu_info *gpu_info,
-	enum kbase_hwcnt_set counter_set,
-	const struct kbase_hwcnt_metadata **out_metadata,
-	size_t *out_dump_bytes)
+int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info,
+				   enum kbase_hwcnt_set counter_set,
+				   const struct kbase_hwcnt_metadata **out_metadata,
+				   size_t *out_dump_bytes)
 {
 	int errcode;
 	const struct kbase_hwcnt_metadata *metadata;
@ -250,8 +239,7 @@ int kbase_hwcnt_jm_metadata_create(
 	 * all the available L2 cache and Shader cores are allocated.
 	 */
 	dump_bytes = kbasep_hwcnt_backend_jm_dump_bytes(gpu_info);
-	errcode = kbasep_hwcnt_backend_gpu_metadata_create(
-		gpu_info, false, counter_set, &metadata);
+	errcode = kbasep_hwcnt_backend_gpu_metadata_create(gpu_info, false, counter_set, &metadata);
 	if (errcode)
 		return errcode;

@ -276,10 +264,9 @@ void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata
 	kbase_hwcnt_metadata_destroy(metadata);
 }

-int kbase_hwcnt_csf_metadata_create(
-	const struct kbase_hwcnt_gpu_info *gpu_info,
-	enum kbase_hwcnt_set counter_set,
-	const struct kbase_hwcnt_metadata **out_metadata)
+int kbase_hwcnt_csf_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info,
+				    enum kbase_hwcnt_set counter_set,
+				    const struct kbase_hwcnt_metadata **out_metadata)
 {
 	int errcode;
 	const struct kbase_hwcnt_metadata *metadata;
@ -287,8 +274,7 @@ int kbase_hwcnt_csf_metadata_create(
 	if (!gpu_info || !out_metadata)
 		return -EINVAL;

-	errcode = kbasep_hwcnt_backend_gpu_metadata_create(
-		gpu_info, true, counter_set, &metadata);
+	errcode = kbasep_hwcnt_backend_gpu_metadata_create(gpu_info, true, counter_set, &metadata);
 	if (errcode)
 		return errcode;

@ -297,8 +283,7 @@ int kbase_hwcnt_csf_metadata_create(
 	return 0;
 }

-void kbase_hwcnt_csf_metadata_destroy(
-	const struct kbase_hwcnt_metadata *metadata)
+void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadata)
 {
 	if (!metadata)
 		return;
@ -306,10 +291,7 @@ void kbase_hwcnt_csf_metadata_destroy(
 	kbase_hwcnt_metadata_destroy(metadata);
 }

-static bool is_block_type_shader(
-	const u64 grp_type,
-	const u64 blk_type,
-	const size_t blk)
+static bool is_block_type_shader(const u64 grp_type, const u64 blk_type, const size_t blk)
 {
 	bool is_shader = false;

@ -326,9 +308,7 @@ static bool is_block_type_shader(
 	return is_shader;
 }

-static bool is_block_type_l2_cache(
-	const u64 grp_type,
-	const u64 blk_type)
+static bool is_block_type_l2_cache(const u64 grp_type, const u64 blk_type)
 {
 	bool is_l2_cache = false;

@ -348,10 +328,8 @@ static bool is_block_type_l2_cache(
 }

 int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
-			    const struct kbase_hwcnt_enable_map *dst_enable_map,
-			    u64 pm_core_mask,
-			    const struct kbase_hwcnt_curr_config *curr_config,
-			    bool accumulate)
+			    const struct kbase_hwcnt_enable_map *dst_enable_map, u64 pm_core_mask,
+			    const struct kbase_hwcnt_curr_config *curr_config, bool accumulate)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;
@ -362,28 +340,21 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 	/* Variables to deal with the current configuration */
 	int l2_count = 0;

-	if (!dst || !src || !dst_enable_map ||
-	    (dst_enable_map->metadata != dst->metadata))
+	if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata))
 		return -EINVAL;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(
-		metadata, grp, blk, blk_inst) {
-		const size_t hdr_cnt =
-			kbase_hwcnt_metadata_block_headers_count(
-				metadata, grp, blk);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
 		const size_t ctr_cnt =
-			kbase_hwcnt_metadata_block_counters_count(
-				metadata, grp, blk);
-		const u64 blk_type = kbase_hwcnt_metadata_block_type(
-			metadata, grp, blk);
+			kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
+		const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
 		const bool is_shader_core = is_block_type_shader(
-			kbase_hwcnt_metadata_group_type(metadata, grp),
-			blk_type, blk);
+			kbase_hwcnt_metadata_group_type(metadata, grp), blk_type, blk);
 		const bool is_l2_cache = is_block_type_l2_cache(
-			kbase_hwcnt_metadata_group_type(metadata, grp),
-			blk_type);
+			kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
 		const bool is_undefined = kbase_hwcnt_is_block_type_undefined(
 			kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
 		bool hw_res_available = true;
@ -412,10 +383,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 		/*
 		 * Skip block if no values in the destination block are enabled.
 		 */
-		if (kbase_hwcnt_enable_map_block_enabled(
-			dst_enable_map, grp, blk, blk_inst)) {
-			u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-				dst, grp, blk, blk_inst);
+		if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) {
+			u64 *dst_blk =
+				kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
 			const u64 *src_blk = dump_src + src_offset;
 			bool blk_powered;

@ -435,13 +405,11 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 			if (blk_powered && !is_undefined && hw_res_available) {
 				/* Only powered and defined blocks have valid data. */
 				if (accumulate) {
-					kbase_hwcnt_dump_buffer_block_accumulate(
-						dst_blk, src_blk, hdr_cnt,
-						ctr_cnt);
+					kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk,
+										 hdr_cnt, ctr_cnt);
 				} else {
-					kbase_hwcnt_dump_buffer_block_copy(
-						dst_blk, src_blk,
-						(hdr_cnt + ctr_cnt));
+					kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk,
+									   (hdr_cnt + ctr_cnt));
 				}
 			} else {
 				/* Even though the block might be undefined, the
@ -469,26 +437,23 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 }

 int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
-			     const struct kbase_hwcnt_enable_map *dst_enable_map,
-			     bool accumulate)
+			     const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	const u64 *dump_src = src;
 	size_t src_offset = 0;
 	size_t grp, blk, blk_inst;

-	if (!dst || !src || !dst_enable_map ||
-	    (dst_enable_map->metadata != dst->metadata))
+	if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata))
 		return -EINVAL;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(
-			metadata, grp, blk);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
 		const size_t ctr_cnt =
-			kbase_hwcnt_metadata_block_counters_count(metadata, grp,
-								  blk);
+			kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
 		const uint64_t blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
 		const bool is_undefined = kbase_hwcnt_is_block_type_undefined(
 			kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
@ -496,10 +461,9 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 		/*
 		 * Skip block if no values in the destination block are enabled.
 		 */
-		if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp,
-							 blk, blk_inst)) {
-			u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-				dst, grp, blk, blk_inst);
+		if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) {
+			u64 *dst_blk =
+				kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
 			const u64 *src_blk = dump_src + src_offset;

 			if (!is_undefined) {
@ -542,12 +506,9 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 * @hi:   Non-NULL pointer to where high 64 bits of block enable map abstraction
 *        will be stored.
 */
-static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical(
-	u32 phys,
-	u64 *lo,
-	u64 *hi)
+static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical(u32 phys, u64 *lo, u64 *hi)
 {
-	u64 dwords[2] = {0, 0};
+	u64 dwords[2] = { 0, 0 };

 	size_t dword_idx;

@ -572,9 +533,8 @@ static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical(
 	*hi = dwords[1];
 }

-void kbase_hwcnt_gpu_enable_map_to_physical(
-	struct kbase_hwcnt_physical_enable_map *dst,
-	const struct kbase_hwcnt_enable_map *src)
+void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_map *dst,
+					    const struct kbase_hwcnt_enable_map *src)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	u64 fe_bm[EM_COUNT] = { 0 };
@ -588,17 +548,13 @@ void kbase_hwcnt_gpu_enable_map_to_physical(

 	metadata = src->metadata;

-	kbase_hwcnt_metadata_for_each_block(
-		metadata, grp, blk, blk_inst) {
-		const u64 grp_type = kbase_hwcnt_metadata_group_type(
-			metadata, grp);
-		const u64 blk_type = kbase_hwcnt_metadata_block_type(
-			metadata, grp, blk);
-		const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(
-			src, grp, blk, blk_inst);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
+		const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
+		const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(src, grp, blk, blk_inst);

-		if ((enum kbase_hwcnt_gpu_group_type)grp_type ==
-		    KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
+		if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
 			const size_t map_stride =
 				kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
 			size_t map_idx;
@ -649,8 +605,7 @@ void kbase_hwcnt_gpu_enable_map_to_physical(
 		kbase_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm[EM_LO], mmu_l2_bm[EM_HI]);
 }

-void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst,
-				     enum kbase_hwcnt_set src)
+void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src)
 {
 	switch (src) {
 	case KBASE_HWCNT_SET_PRIMARY:
@ -667,9 +622,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst,
 	}
 }

-void kbase_hwcnt_gpu_enable_map_from_physical(
-	struct kbase_hwcnt_enable_map *dst,
-	const struct kbase_hwcnt_physical_enable_map *src)
+void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst,
+					      const struct kbase_hwcnt_physical_enable_map *src)
 {
 	const struct kbase_hwcnt_metadata *metadata;

@ -692,16 +646,13 @@ void kbase_hwcnt_gpu_enable_map_from_physical(
 	kbasep_hwcnt_backend_gpu_block_map_from_physical(src->mmu_l2_bm, &mmu_l2_bm[EM_LO],
 							 &mmu_l2_bm[EM_HI]);

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		const u64 grp_type = kbase_hwcnt_metadata_group_type(
-			metadata, grp);
-		const u64 blk_type = kbase_hwcnt_metadata_block_type(
-			metadata, grp, blk);
-		u64 *blk_map = kbase_hwcnt_enable_map_block_instance(
-			dst, grp, blk, blk_inst);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
+		const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
+		u64 *blk_map = kbase_hwcnt_enable_map_block_instance(dst, grp, blk, blk_inst);

-		if ((enum kbase_hwcnt_gpu_group_type)grp_type ==
-		    KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
+		if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
 			const size_t map_stride =
 				kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
 			size_t map_idx;
@ -744,29 +695,25 @@ void kbase_hwcnt_gpu_enable_map_from_physical(
 	}
 }

-void kbase_hwcnt_gpu_patch_dump_headers(
-	struct kbase_hwcnt_dump_buffer *buf,
-	const struct kbase_hwcnt_enable_map *enable_map)
+void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf,
+					const struct kbase_hwcnt_enable_map *enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;

-	if (WARN_ON(!buf) || WARN_ON(!enable_map) ||
-	    WARN_ON(buf->metadata != enable_map->metadata))
+	if (WARN_ON(!buf) || WARN_ON(!enable_map) || WARN_ON(buf->metadata != enable_map->metadata))
 		return;

 	metadata = buf->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		const u64 grp_type =
-			kbase_hwcnt_metadata_group_type(metadata, grp);
-		u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance(
-			buf, grp, blk, blk_inst);
-		const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(
-			enable_map, grp, blk, blk_inst);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
+		u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst);
+		const u64 *blk_map =
+			kbase_hwcnt_enable_map_block_instance(enable_map, grp, blk, blk_inst);

-		if ((enum kbase_hwcnt_gpu_group_type)grp_type ==
-		    KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
+		if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
 			const size_t map_stride =
 				kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
 			u64 prfcnt_bm[EM_COUNT] = { 0 };
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.h
@ -34,9 +34,8 @@ struct kbase_hwcnt_dump_buffer;
 #define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4
 #define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4
 #define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60
-#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK                                \
-	(KBASE_HWCNT_V5_HEADERS_PER_BLOCK +                                    \
-	 KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK)
+#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK                                                    \
+	(KBASE_HWCNT_V5_HEADERS_PER_BLOCK + KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK)

 /* FrontEnd block count in V5 GPU hardware counter. */
 #define KBASE_HWCNT_V5_FE_BLOCK_COUNT 1
@ -228,19 +227,17 @@ static inline bool kbase_hwcnt_is_block_type_undefined(const uint64_t grp_type,
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_jm_metadata_create(
-	const struct kbase_hwcnt_gpu_info *info,
-	enum kbase_hwcnt_set counter_set,
-	const struct kbase_hwcnt_metadata **out_metadata,
-	size_t *out_dump_bytes);
+int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *info,
+				   enum kbase_hwcnt_set counter_set,
+				   const struct kbase_hwcnt_metadata **out_metadata,
+				   size_t *out_dump_bytes);

 /**
 * kbase_hwcnt_jm_metadata_destroy() - Destroy JM GPU hardware counter metadata.
 *
 * @metadata: Pointer to metadata to destroy.
 */
-void kbase_hwcnt_jm_metadata_destroy(
-	const struct kbase_hwcnt_metadata *metadata);
+void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata);

 /**
 * kbase_hwcnt_csf_metadata_create() - Create hardware counter metadata for the
@ -252,18 +249,16 @@ void kbase_hwcnt_jm_metadata_destroy(
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_csf_metadata_create(
-	const struct kbase_hwcnt_gpu_info *info,
-	enum kbase_hwcnt_set counter_set,
-	const struct kbase_hwcnt_metadata **out_metadata);
+int kbase_hwcnt_csf_metadata_create(const struct kbase_hwcnt_gpu_info *info,
+				    enum kbase_hwcnt_set counter_set,
+				    const struct kbase_hwcnt_metadata **out_metadata);

 /**
 * kbase_hwcnt_csf_metadata_destroy() - Destroy CSF GPU hardware counter
 *                                      metadata.
 * @metadata: Pointer to metadata to destroy.
 */
-void kbase_hwcnt_csf_metadata_destroy(
-	const struct kbase_hwcnt_metadata *metadata);
+void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadata);

 /**
 * kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw
@ -289,8 +284,7 @@ void kbase_hwcnt_csf_metadata_destroy(
 int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 			    const struct kbase_hwcnt_enable_map *dst_enable_map,
 			    const u64 pm_core_mask,
-			    const struct kbase_hwcnt_curr_config *curr_config,
-			    bool accumulate);
+			    const struct kbase_hwcnt_curr_config *curr_config, bool accumulate);

 /**
 * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw
@ -310,8 +304,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
 * Return: 0 on success, else error code.
 */
 int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
-			     const struct kbase_hwcnt_enable_map *dst_enable_map,
-			     bool accumulate);
+			     const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate);

 /**
 * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block
@ -365,9 +358,8 @@ static inline u32 kbase_hwcnt_backend_gpu_block_map_to_physical(u64 lo, u64 hi)
 * individual counter block value, but the physical enable map uses 1 bit for
 * every 4 counters, shared over all instances of a block.
 */
-void kbase_hwcnt_gpu_enable_map_to_physical(
-	struct kbase_hwcnt_physical_enable_map *dst,
-	const struct kbase_hwcnt_enable_map *src);
+void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_map *dst,
+					    const struct kbase_hwcnt_enable_map *src);

 /**
 * kbase_hwcnt_gpu_set_to_physical() - Map counter set selection to physical
@ -376,8 +368,7 @@ void kbase_hwcnt_gpu_enable_map_to_physical(
 * @dst: Non-NULL pointer to destination physical SET_SELECT value.
 * @src: Source counter set selection (enum value, passed by value).
 */
-void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst,
-				     enum kbase_hwcnt_set src);
+void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src);

 /**
 * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to
@ -393,9 +384,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst,
 * more than 64, so the enable map abstraction has nowhere to store the enable
 * information for the 64 non-existent counters.
 */
-void kbase_hwcnt_gpu_enable_map_from_physical(
-	struct kbase_hwcnt_enable_map *dst,
-	const struct kbase_hwcnt_physical_enable_map *src);
+void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst,
+					      const struct kbase_hwcnt_physical_enable_map *src);

 /**
 * kbase_hwcnt_gpu_patch_dump_headers() - Patch all the performance counter
@ -411,8 +401,7 @@ void kbase_hwcnt_gpu_enable_map_from_physical(
 * kernel-user boundary, to ensure the header is accurate for the enable map
 * used by the user.
 */
-void kbase_hwcnt_gpu_patch_dump_headers(
-	struct kbase_hwcnt_dump_buffer *buf,
-	const struct kbase_hwcnt_enable_map *enable_map);
+void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf,
+					const struct kbase_hwcnt_enable_map *enable_map);

 #endif /* _KBASE_HWCNT_GPU_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.c
@ -19,21 +19,19 @@
 *
 */

-#include "mali_kbase_hwcnt_gpu.h"
-#include "mali_kbase_hwcnt_gpu_narrow.h"
+#include "hwcnt/mali_kbase_hwcnt_gpu.h"
+#include "hwcnt/mali_kbase_hwcnt_gpu_narrow.h"

 #include <linux/bug.h>
 #include <linux/err.h>
 #include <linux/slab.h>

-int kbase_hwcnt_gpu_metadata_narrow_create(
-	const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
-	const struct kbase_hwcnt_metadata *src_md)
+int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
+					   const struct kbase_hwcnt_metadata *src_md)
 {
 	struct kbase_hwcnt_description desc;
 	struct kbase_hwcnt_group_description group;
-	struct kbase_hwcnt_block_description
-		blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
+	struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
 	size_t prfcnt_values_per_block;
 	size_t blk;
 	int err;
@ -47,18 +45,15 @@ int kbase_hwcnt_gpu_metadata_narrow_create(
 	 * count in the metadata.
 	 */
 	if ((kbase_hwcnt_metadata_group_count(src_md) != 1) ||
-	    (kbase_hwcnt_metadata_block_count(src_md, 0) !=
-	     KBASE_HWCNT_V5_BLOCK_TYPE_COUNT))
+	    (kbase_hwcnt_metadata_block_count(src_md, 0) != KBASE_HWCNT_V5_BLOCK_TYPE_COUNT))
 		return -EINVAL;

 	/* Get the values count in the first block. */
-	prfcnt_values_per_block =
-		kbase_hwcnt_metadata_block_values_count(src_md, 0, 0);
+	prfcnt_values_per_block = kbase_hwcnt_metadata_block_values_count(src_md, 0, 0);

 	/* Check that all blocks have the same values count. */
 	for (blk = 1; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) {
-		size_t val_cnt =
-			kbase_hwcnt_metadata_block_values_count(src_md, 0, blk);
+		size_t val_cnt = kbase_hwcnt_metadata_block_values_count(src_md, 0, blk);
 		if (val_cnt != prfcnt_values_per_block)
 			return -EINVAL;
 	}
@ -75,12 +70,10 @@ int kbase_hwcnt_gpu_metadata_narrow_create(
 	prfcnt_values_per_block = 64;

 	for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) {
-		size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count(
-			src_md, 0, blk);
+		size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count(src_md, 0, blk);
 		blks[blk] = (struct kbase_hwcnt_block_description){
 			.type = kbase_hwcnt_metadata_block_type(src_md, 0, blk),
-			.inst_cnt = kbase_hwcnt_metadata_block_instance_count(
-				src_md, 0, blk),
+			.inst_cnt = kbase_hwcnt_metadata_block_instance_count(src_md, 0, blk),
 			.hdr_cnt = blk_hdr_cnt,
 			.ctr_cnt = prfcnt_values_per_block - blk_hdr_cnt,
 		};
@ -105,8 +98,7 @@ int kbase_hwcnt_gpu_metadata_narrow_create(
 		 * only supports 32-bit but the created metadata uses 64-bit for
 		 * block entry.
 		 */
-		metadata_narrow->dump_buf_bytes =
-			metadata_narrow->metadata->dump_buf_bytes >> 1;
+		metadata_narrow->dump_buf_bytes = metadata_narrow->metadata->dump_buf_bytes >> 1;
 		*dst_md_narrow = metadata_narrow;
 	} else {
 		kfree(metadata_narrow);
@ -115,8 +107,7 @@ int kbase_hwcnt_gpu_metadata_narrow_create(
 	return err;
 }

-void kbase_hwcnt_gpu_metadata_narrow_destroy(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow)
+void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow)
 {
 	if (!md_narrow)
 		return;
@ -125,9 +116,8 @@ void kbase_hwcnt_gpu_metadata_narrow_destroy(
 	kfree(md_narrow);
 }

-int kbase_hwcnt_dump_buffer_narrow_alloc(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow,
-	struct kbase_hwcnt_dump_buffer_narrow *dump_buf)
+int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+					 struct kbase_hwcnt_dump_buffer_narrow *dump_buf)
 {
 	size_t dump_buf_bytes;
 	size_t clk_cnt_buf_bytes;
@ -137,8 +127,7 @@ int kbase_hwcnt_dump_buffer_narrow_alloc(
 		return -EINVAL;

 	dump_buf_bytes = md_narrow->dump_buf_bytes;
-	clk_cnt_buf_bytes =
-		sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt;
+	clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt;

 	/* Make a single allocation for both dump_buf and clk_cnt_buf. */
 	buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL);
@ -154,14 +143,15 @@ int kbase_hwcnt_dump_buffer_narrow_alloc(
 	return 0;
 }

-void kbase_hwcnt_dump_buffer_narrow_free(
-	struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow)
+void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow)
 {
 	if (!dump_buf_narrow)
 		return;

 	kfree(dump_buf_narrow->dump_buf);
-	*dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ NULL };
+	*dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ .md_narrow = NULL,
+								    .dump_buf = NULL,
+								    .clk_cnt_buf = NULL };
 }

 int kbase_hwcnt_dump_buffer_narrow_array_alloc(
@ -180,8 +170,7 @@ int kbase_hwcnt_dump_buffer_narrow_array_alloc(
 		return -EINVAL;

 	dump_buf_bytes = md_narrow->dump_buf_bytes;
-	clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) *
-			    md_narrow->metadata->clk_cnt;
+	clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * md_narrow->metadata->clk_cnt;

 	/* Allocate memory for the dump buffer struct array */
 	buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL);
@ -234,27 +223,22 @@ void kbase_hwcnt_dump_buffer_narrow_array_free(
 	memset(dump_bufs, 0, sizeof(*dump_bufs));
 }

-void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk,
-						      const u64 *src_blk,
-						      const u64 *blk_em,
-						      size_t val_cnt)
+void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk,
+						      const u64 *blk_em, size_t val_cnt)
 {
 	size_t val;

 	for (val = 0; val < val_cnt; val++) {
-		bool val_enabled =
-			kbase_hwcnt_enable_map_block_value_enabled(blk_em, val);
-		u32 src_val =
-			(src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val];
+		bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled(blk_em, val);
+		u32 src_val = (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val];

 		dst_blk[val] = val_enabled ? src_val : 0;
 	}
 }

-void kbase_hwcnt_dump_buffer_copy_strict_narrow(
-	struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
+						const struct kbase_hwcnt_dump_buffer *src,
+						const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata_narrow *metadata_narrow;
 	size_t grp;
@ -262,68 +246,53 @@ void kbase_hwcnt_dump_buffer_copy_strict_narrow(

 	if (WARN_ON(!dst_narrow) || WARN_ON(!src) || WARN_ON(!dst_enable_map) ||
 	    WARN_ON(dst_narrow->md_narrow->metadata == src->metadata) ||
-	    WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt !=
-		    src->metadata->grp_cnt) ||
+	    WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != src->metadata->grp_cnt) ||
 	    WARN_ON(src->metadata->grp_cnt != 1) ||
 	    WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt !=
 		    src->metadata->grp_metadata[0].blk_cnt) ||
 	    WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt !=
 		    KBASE_HWCNT_V5_BLOCK_TYPE_COUNT) ||
-	    WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0]
-			    .blk_metadata[0]
-			    .ctr_cnt >
+	    WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt >
 		    src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt))
 		return;

 	/* Don't use src metadata since src buffer is bigger than dst buffer. */
 	metadata_narrow = dst_narrow->md_narrow;

-	for (grp = 0;
-	     grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow);
-	     grp++) {
+	for (grp = 0; grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); grp++) {
 		size_t blk;
-		size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count(
-			metadata_narrow, grp);
+		size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count(metadata_narrow, grp);

 		for (blk = 0; blk < blk_cnt; blk++) {
 			size_t blk_inst;
-			size_t blk_inst_cnt =
-				kbase_hwcnt_metadata_narrow_block_instance_count(
-					metadata_narrow, grp, blk);
+			size_t blk_inst_cnt = kbase_hwcnt_metadata_narrow_block_instance_count(
+				metadata_narrow, grp, blk);

-			for (blk_inst = 0; blk_inst < blk_inst_cnt;
-			     blk_inst++) {
+			for (blk_inst = 0; blk_inst < blk_inst_cnt; blk_inst++) {
 				/* The narrowed down buffer is only 32-bit. */
-				u32 *dst_blk =
-					kbase_hwcnt_dump_buffer_narrow_block_instance(
-						dst_narrow, grp, blk, blk_inst);
-				const u64 *src_blk =
-					kbase_hwcnt_dump_buffer_block_instance(
-						src, grp, blk, blk_inst);
-				const u64 *blk_em =
-					kbase_hwcnt_enable_map_block_instance(
-						dst_enable_map, grp, blk,
-						blk_inst);
-				size_t val_cnt =
-					kbase_hwcnt_metadata_narrow_block_values_count(
-						metadata_narrow, grp, blk);
+				u32 *dst_blk = kbase_hwcnt_dump_buffer_narrow_block_instance(
+					dst_narrow, grp, blk, blk_inst);
+				const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(
+					src, grp, blk, blk_inst);
+				const u64 *blk_em = kbase_hwcnt_enable_map_block_instance(
+					dst_enable_map, grp, blk, blk_inst);
+				size_t val_cnt = kbase_hwcnt_metadata_narrow_block_values_count(
+					metadata_narrow, grp, blk);
 				/* Align upwards to include padding bytes */
 				val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
-					val_cnt,
-					(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
-					 KBASE_HWCNT_VALUE_BYTES));
+					val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
+						  KBASE_HWCNT_VALUE_BYTES));

-				kbase_hwcnt_dump_buffer_block_copy_strict_narrow(
-					dst_blk, src_blk, blk_em, val_cnt);
+				kbase_hwcnt_dump_buffer_block_copy_strict_narrow(dst_blk, src_blk,
+										 blk_em, val_cnt);
 			}
 		}
 	}

 	for (clk = 0; clk < metadata_narrow->metadata->clk_cnt; clk++) {
-		bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled(
-			dst_enable_map->clk_enable_map, clk);
+		bool clk_enabled =
+			kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk);

-		dst_narrow->clk_cnt_buf[clk] =
-			clk_enabled ? src->clk_cnt_buf[clk] : 0;
+		dst_narrow->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0;
 	}
 }
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.h
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -22,7 +22,7 @@
 #ifndef _KBASE_HWCNT_GPU_NARROW_H_
 #define _KBASE_HWCNT_GPU_NARROW_H_

-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"
 #include <linux/types.h>

 struct kbase_device;
@ -86,8 +86,8 @@ struct kbase_hwcnt_dump_buffer_narrow_array {
 *
 * Return: Number of hardware counter groups described by narrow metadata.
 */
-static inline size_t kbase_hwcnt_metadata_narrow_group_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow)
+static inline size_t
+kbase_hwcnt_metadata_narrow_group_count(const struct kbase_hwcnt_metadata_narrow *md_narrow)
 {
 	return kbase_hwcnt_metadata_group_count(md_narrow->metadata);
 }
@ -100,8 +100,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_group_count(
 *
 * Return: Type of the group grp.
 */
-static inline u64 kbase_hwcnt_metadata_narrow_group_type(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp)
+static inline u64
+kbase_hwcnt_metadata_narrow_group_type(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+				       size_t grp)
 {
 	return kbase_hwcnt_metadata_group_type(md_narrow->metadata, grp);
 }
@ -114,8 +115,9 @@ static inline u64 kbase_hwcnt_metadata_narrow_group_type(
 *
 * Return: Number of blocks in group grp.
 */
-static inline size_t kbase_hwcnt_metadata_narrow_block_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp)
+static inline size_t
+kbase_hwcnt_metadata_narrow_block_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+					size_t grp)
 {
 	return kbase_hwcnt_metadata_block_count(md_narrow->metadata, grp);
 }
@ -131,11 +133,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_count(
 * Return: Number of instances of block blk in group grp.
 */
 static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp,
-	size_t blk)
+	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk)
 {
-	return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata,
-							 grp, blk);
+	return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, grp, blk);
 }

 /**
@ -148,12 +148,11 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count(
 *
 * Return: Number of counter headers in each instance of block blk in group grp.
 */
-static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp,
-	size_t blk)
+static inline size_t
+kbase_hwcnt_metadata_narrow_block_headers_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+						size_t grp, size_t blk)
 {
-	return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata,
-							grp, blk);
+	return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, grp, blk);
 }

 /**
@ -167,11 +166,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count(
 * Return: Number of counters in each instance of block blk in group grp.
 */
 static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp,
-	size_t blk)
+	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk)
 {
-	return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata,
-							 grp, blk);
+	return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, grp, blk);
 }

 /**
@ -184,14 +181,12 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count(
 * Return: Number of headers plus counters in each instance of block blk
 *         in group grp.
 */
-static inline size_t kbase_hwcnt_metadata_narrow_block_values_count(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp,
-	size_t blk)
+static inline size_t
+kbase_hwcnt_metadata_narrow_block_values_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+					       size_t grp, size_t blk)
 {
-	return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp,
-								blk) +
-	       kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp,
-							       blk);
+	return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, blk) +
+	       kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, blk);
 }

 /**
@ -205,18 +200,13 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_values_count(
 *
 * Return: u32* to the dump buffer for the block instance.
 */
-static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance(
-	const struct kbase_hwcnt_dump_buffer_narrow *buf, size_t grp,
-	size_t blk, size_t blk_inst)
+static inline u32 *
+kbase_hwcnt_dump_buffer_narrow_block_instance(const struct kbase_hwcnt_dump_buffer_narrow *buf,
+					      size_t grp, size_t blk, size_t blk_inst)
 {
-	return buf->dump_buf +
-	       buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index +
-	       buf->md_narrow->metadata->grp_metadata[grp]
-		       .blk_metadata[blk]
-		       .dump_buf_index +
-	       (buf->md_narrow->metadata->grp_metadata[grp]
-			.blk_metadata[blk]
-			.dump_buf_stride *
+	return buf->dump_buf + buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index +
+	       buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index +
+	       (buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_stride *
 		blk_inst);
 }

@ -239,17 +229,15 @@ static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance(
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_gpu_metadata_narrow_create(
-	const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
-	const struct kbase_hwcnt_metadata *src_md);
+int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
+					   const struct kbase_hwcnt_metadata *src_md);

 /**
 * kbase_hwcnt_gpu_metadata_narrow_destroy() - Destroy a hardware counter narrow
 *                                             metadata object.
 * @md_narrow: Pointer to hardware counter narrow metadata.
 */
-void kbase_hwcnt_gpu_metadata_narrow_destroy(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow);
+void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow);

 /**
 * kbase_hwcnt_dump_buffer_narrow_alloc() - Allocate a narrow dump buffer.
@ -260,9 +248,8 @@ void kbase_hwcnt_gpu_metadata_narrow_destroy(
 *
 * Return: 0 on success, else error code.
 */
-int kbase_hwcnt_dump_buffer_narrow_alloc(
-	const struct kbase_hwcnt_metadata_narrow *md_narrow,
-	struct kbase_hwcnt_dump_buffer_narrow *dump_buf);
+int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow,
+					 struct kbase_hwcnt_dump_buffer_narrow *dump_buf);

 /**
 * kbase_hwcnt_dump_buffer_narrow_free() - Free a narrow dump buffer.
@ -271,8 +258,7 @@ int kbase_hwcnt_dump_buffer_narrow_alloc(
 * Can be safely called on an all-zeroed narrow dump buffer structure, or on an
 * already freed narrow dump buffer.
 */
-void kbase_hwcnt_dump_buffer_narrow_free(
-	struct kbase_hwcnt_dump_buffer_narrow *dump_buf);
+void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf);

 /**
 * kbase_hwcnt_dump_buffer_narrow_array_alloc() - Allocate an array of narrow
@ -320,10 +306,8 @@ void kbase_hwcnt_dump_buffer_narrow_array_free(
 * source value is bigger than U32_MAX, or copy the value from source if the
 * corresponding source value is less than or equal to U32_MAX.
 */
-void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk,
-						      const u64 *src_blk,
-						      const u64 *blk_em,
-						      size_t val_cnt);
+void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk,
+						      const u64 *blk_em, size_t val_cnt);

 /**
 * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values to a
@ -339,9 +323,8 @@ void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk,
 * corresponding source value is bigger than U32_MAX, or copy the value from
 * source if the corresponding source value is less than or equal to U32_MAX.
 */
-void kbase_hwcnt_dump_buffer_copy_strict_narrow(
-	struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map);
+void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
+						const struct kbase_hwcnt_dump_buffer *src,
+						const struct kbase_hwcnt_enable_map *dst_enable_map);

 #endif /* _KBASE_HWCNT_GPU_NARROW_H_ */
--- a/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.c
+++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.c
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
 *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@ -19,13 +19,12 @@
 *
 */

-#include "mali_kbase_hwcnt_types.h"
+#include "hwcnt/mali_kbase_hwcnt_types.h"

 #include <linux/slab.h>

-int kbase_hwcnt_metadata_create(
-	const struct kbase_hwcnt_description *desc,
-	const struct kbase_hwcnt_metadata **out_metadata)
+int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc,
+				const struct kbase_hwcnt_metadata **out_metadata)
 {
 	char *buf;
 	struct kbase_hwcnt_metadata *metadata;
@ -56,8 +55,7 @@ int kbase_hwcnt_metadata_create(

 	/* Block metadata */
 	for (grp = 0; grp < desc->grp_cnt; grp++) {
-		size += sizeof(struct kbase_hwcnt_block_metadata) *
-			desc->grps[grp].blk_cnt;
+		size += sizeof(struct kbase_hwcnt_block_metadata) * desc->grps[grp].blk_cnt;
 	}

 	/* Single allocation for the entire metadata */
@ -83,8 +81,7 @@ int kbase_hwcnt_metadata_create(
 	for (grp = 0; grp < desc->grp_cnt; grp++) {
 		size_t blk;

-		const struct kbase_hwcnt_group_description *grp_desc =
-			desc->grps + grp;
+		const struct kbase_hwcnt_group_description *grp_desc = desc->grps + grp;
 		struct kbase_hwcnt_group_metadata *grp_md = grp_mds + grp;

 		size_t group_enable_map_count = 0;
@ -94,37 +91,28 @@ int kbase_hwcnt_metadata_create(
 		/* Bump allocate this group's block metadata */
 		struct kbase_hwcnt_block_metadata *blk_mds =
 			(struct kbase_hwcnt_block_metadata *)(buf + offset);
-		offset += sizeof(struct kbase_hwcnt_block_metadata) *
-			grp_desc->blk_cnt;
+		offset += sizeof(struct kbase_hwcnt_block_metadata) * grp_desc->blk_cnt;

 		/* Fill in each block in the group's information */
 		for (blk = 0; blk < grp_desc->blk_cnt; blk++) {
-			const struct kbase_hwcnt_block_description *blk_desc =
-				grp_desc->blks + blk;
-			struct kbase_hwcnt_block_metadata *blk_md =
-				blk_mds + blk;
-			const size_t n_values =
-				blk_desc->hdr_cnt + blk_desc->ctr_cnt;
+			const struct kbase_hwcnt_block_description *blk_desc = grp_desc->blks + blk;
+			struct kbase_hwcnt_block_metadata *blk_md = blk_mds + blk;
+			const size_t n_values = blk_desc->hdr_cnt + blk_desc->ctr_cnt;

 			blk_md->type = blk_desc->type;
 			blk_md->inst_cnt = blk_desc->inst_cnt;
 			blk_md->hdr_cnt = blk_desc->hdr_cnt;
 			blk_md->ctr_cnt = blk_desc->ctr_cnt;
 			blk_md->enable_map_index = group_enable_map_count;
-			blk_md->enable_map_stride =
-				kbase_hwcnt_bitfield_count(n_values);
+			blk_md->enable_map_stride = kbase_hwcnt_bitfield_count(n_values);
 			blk_md->dump_buf_index = group_dump_buffer_count;
-			blk_md->dump_buf_stride =
-				KBASE_HWCNT_ALIGN_UPWARDS(
-					n_values,
-					(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
-					 KBASE_HWCNT_VALUE_BYTES));
+			blk_md->dump_buf_stride = KBASE_HWCNT_ALIGN_UPWARDS(
+				n_values,
+				(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));
 			blk_md->avail_mask_index = group_avail_mask_bits;

-			group_enable_map_count +=
-				blk_md->enable_map_stride * blk_md->inst_cnt;
-			group_dump_buffer_count +=
-				blk_md->dump_buf_stride * blk_md->inst_cnt;
+			group_enable_map_count += blk_md->enable_map_stride * blk_md->inst_cnt;
+			group_dump_buffer_count += blk_md->dump_buf_stride * blk_md->inst_cnt;
 			group_avail_mask_bits += blk_md->inst_cnt;
 		}

@ -144,8 +132,7 @@ int kbase_hwcnt_metadata_create(
 	/* Fill in the top level metadata's information */
 	metadata->grp_cnt = desc->grp_cnt;
 	metadata->grp_metadata = grp_mds;
-	metadata->enable_map_bytes =
-		enable_map_count * KBASE_HWCNT_BITFIELD_BYTES;
+	metadata->enable_map_bytes = enable_map_count * KBASE_HWCNT_BITFIELD_BYTES;
 	metadata->dump_buf_bytes = dump_buf_count * KBASE_HWCNT_VALUE_BYTES;
 	metadata->avail_mask = desc->avail_mask;
 	metadata->clk_cnt = desc->clk_cnt;
@ -155,8 +142,7 @@ int kbase_hwcnt_metadata_create(
 	 * bit per 4 bytes in the dump buffer.
 	 */
 	WARN_ON(metadata->dump_buf_bytes !=
-		(metadata->enable_map_bytes *
-		 BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES));
+		(metadata->enable_map_bytes * BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES));

 	*out_metadata = metadata;
 	return 0;
@ -167,9 +153,8 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata)
 	kfree(metadata);
 }

-int kbase_hwcnt_enable_map_alloc(
-	const struct kbase_hwcnt_metadata *metadata,
-	struct kbase_hwcnt_enable_map *enable_map)
+int kbase_hwcnt_enable_map_alloc(const struct kbase_hwcnt_metadata *metadata,
+				 struct kbase_hwcnt_enable_map *enable_map)
 {
 	u64 *enable_map_buf;

@ -177,8 +162,7 @@ int kbase_hwcnt_enable_map_alloc(
 		return -EINVAL;

 	if (metadata->enable_map_bytes > 0) {
-		enable_map_buf =
-			kzalloc(metadata->enable_map_bytes, GFP_KERNEL);
+		enable_map_buf = kzalloc(metadata->enable_map_bytes, GFP_KERNEL);
 		if (!enable_map_buf)
 			return -ENOMEM;
 	} else {
@ -200,9 +184,8 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map)
 	enable_map->metadata = NULL;
 }

-int kbase_hwcnt_dump_buffer_alloc(
-	const struct kbase_hwcnt_metadata *metadata,
-	struct kbase_hwcnt_dump_buffer *dump_buf)
+int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata,
+				  struct kbase_hwcnt_dump_buffer *dump_buf)
 {
 	size_t dump_buf_bytes;
 	size_t clk_cnt_buf_bytes;
@ -235,10 +218,8 @@ void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf)
 	memset(dump_buf, 0, sizeof(*dump_buf));
 }

-int kbase_hwcnt_dump_buffer_array_alloc(
-	const struct kbase_hwcnt_metadata *metadata,
-	size_t n,
-	struct kbase_hwcnt_dump_buffer_array *dump_bufs)
+int kbase_hwcnt_dump_buffer_array_alloc(const struct kbase_hwcnt_metadata *metadata, size_t n,
+					struct kbase_hwcnt_dump_buffer_array *dump_bufs)
 {
 	struct kbase_hwcnt_dump_buffer *buffers;
 	size_t buf_idx;
@ -251,8 +232,7 @@ int kbase_hwcnt_dump_buffer_array_alloc(
 		return -EINVAL;

 	dump_buf_bytes = metadata->dump_buf_bytes;
-	clk_cnt_buf_bytes =
-		sizeof(*dump_bufs->bufs->clk_cnt_buf) * metadata->clk_cnt;
+	clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * metadata->clk_cnt;

 	/* Allocate memory for the dump buffer struct array */
 	buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL);
@ -283,15 +263,13 @@ int kbase_hwcnt_dump_buffer_array_alloc(

 		buffers[buf_idx].metadata = metadata;
 		buffers[buf_idx].dump_buf = (u64 *)(addr + dump_buf_offset);
-		buffers[buf_idx].clk_cnt_buf =
-			(u64 *)(addr + clk_cnt_buf_offset);
+		buffers[buf_idx].clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset);
 	}

 	return 0;
 }

-void kbase_hwcnt_dump_buffer_array_free(
-	struct kbase_hwcnt_dump_buffer_array *dump_bufs)
+void kbase_hwcnt_dump_buffer_array_free(struct kbase_hwcnt_dump_buffer_array *dump_bufs)
 {
 	if (!dump_bufs)
 		return;
@ -301,84 +279,71 @@ void kbase_hwcnt_dump_buffer_array_free(
 	memset(dump_bufs, 0, sizeof(*dump_bufs));
 }

-void kbase_hwcnt_dump_buffer_zero(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_zero(struct kbase_hwcnt_dump_buffer *dst,
+				  const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!dst_enable_map) ||
+	if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
 		u64 *dst_blk;
 		size_t val_cnt;

-		if (!kbase_hwcnt_enable_map_block_enabled(
-			dst_enable_map, grp, blk, blk_inst))
+		if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
 			continue;

-		dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		val_cnt = kbase_hwcnt_metadata_block_values_count(
-			metadata, grp, blk);
+		dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);

 		kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt);
 	}

-	memset(dst->clk_cnt_buf, 0,
-		sizeof(*dst->clk_cnt_buf) * metadata->clk_cnt);
+	memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * metadata->clk_cnt);
 }

-void kbase_hwcnt_dump_buffer_zero_strict(
-	struct kbase_hwcnt_dump_buffer *dst)
+void kbase_hwcnt_dump_buffer_zero_strict(struct kbase_hwcnt_dump_buffer *dst)
 {
 	if (WARN_ON(!dst))
 		return;

 	memset(dst->dump_buf, 0, dst->metadata->dump_buf_bytes);

-	memset(dst->clk_cnt_buf, 0,
-		sizeof(*dst->clk_cnt_buf) * dst->metadata->clk_cnt);
+	memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * dst->metadata->clk_cnt);
 }

-void kbase_hwcnt_dump_buffer_zero_non_enabled(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_zero_non_enabled(struct kbase_hwcnt_dump_buffer *dst,
+					      const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!dst_enable_map) ||
+	if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		const u64 *blk_em = kbase_hwcnt_enable_map_block_instance(
-			dst_enable_map, grp, blk, blk_inst);
-		size_t val_cnt = kbase_hwcnt_metadata_block_values_count(
-			metadata, grp, blk);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		const u64 *blk_em =
+			kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
+		size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);

 		/* Align upwards to include padding bytes */
-		val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt,
-			(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
-			 KBASE_HWCNT_VALUE_BYTES));
+		val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
+			val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));

-		if (kbase_hwcnt_metadata_block_instance_avail(
-			metadata, grp, blk, blk_inst)) {
+		if (kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk, blk_inst)) {
 			/* Block available, so only zero non-enabled values */
-			kbase_hwcnt_dump_buffer_block_zero_non_enabled(
-				dst_blk, blk_em, val_cnt);
+			kbase_hwcnt_dump_buffer_block_zero_non_enabled(dst_blk, blk_em, val_cnt);
 		} else {
 			/* Block not available, so zero the entire thing */
 			kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt);
@ -386,188 +351,159 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled(
 	}
 }

-void kbase_hwcnt_dump_buffer_copy(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_copy(struct kbase_hwcnt_dump_buffer *dst,
+				  const struct kbase_hwcnt_dump_buffer *src,
+				  const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;
 	size_t clk;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!src) ||
-	    WARN_ON(!dst_enable_map) ||
-	    WARN_ON(dst == src) ||
+	if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
 	    WARN_ON(dst->metadata != src->metadata) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
 		u64 *dst_blk;
 		const u64 *src_blk;
 		size_t val_cnt;

-		if (!kbase_hwcnt_enable_map_block_enabled(
-			dst_enable_map, grp, blk, blk_inst))
+		if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
 			continue;

-		dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		src_blk = kbase_hwcnt_dump_buffer_block_instance(
-			src, grp, blk, blk_inst);
-		val_cnt = kbase_hwcnt_metadata_block_values_count(
-			metadata, grp, blk);
+		dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+		val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);

 		kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk, val_cnt);
 	}

-	kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
-		if (kbase_hwcnt_clk_enable_map_enabled(
-			dst_enable_map->clk_enable_map, clk))
+	kbase_hwcnt_metadata_for_each_clock(metadata, clk)
+	{
+		if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
 			dst->clk_cnt_buf[clk] = src->clk_cnt_buf[clk];
 	}
 }

-void kbase_hwcnt_dump_buffer_copy_strict(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_copy_strict(struct kbase_hwcnt_dump_buffer *dst,
+					 const struct kbase_hwcnt_dump_buffer *src,
+					 const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;
 	size_t clk;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!src) ||
-	    WARN_ON(!dst_enable_map) ||
-	    WARN_ON(dst == src) ||
+	if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
 	    WARN_ON(dst->metadata != src->metadata) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(
-			src, grp, blk, blk_inst);
-		const u64 *blk_em = kbase_hwcnt_enable_map_block_instance(
-			dst_enable_map, grp, blk, blk_inst);
-		size_t val_cnt = kbase_hwcnt_metadata_block_values_count(
-			metadata, grp, blk);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		const u64 *src_blk =
+			kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+		const u64 *blk_em =
+			kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
+		size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);
 		/* Align upwards to include padding bytes */
-		val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt,
-			(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
-			 KBASE_HWCNT_VALUE_BYTES));
+		val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
+			val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));

-		kbase_hwcnt_dump_buffer_block_copy_strict(
-			dst_blk, src_blk, blk_em, val_cnt);
+		kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, blk_em, val_cnt);
 	}

-	kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
+	kbase_hwcnt_metadata_for_each_clock(metadata, clk)
+	{
 		bool clk_enabled =
-			kbase_hwcnt_clk_enable_map_enabled(
-				dst_enable_map->clk_enable_map, clk);
+			kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk);

 		dst->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0;
 	}
 }

-void kbase_hwcnt_dump_buffer_accumulate(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_accumulate(struct kbase_hwcnt_dump_buffer *dst,
+					const struct kbase_hwcnt_dump_buffer *src,
+					const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;
 	size_t clk;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!src) ||
-	    WARN_ON(!dst_enable_map) ||
-	    WARN_ON(dst == src) ||
+	if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
 	    WARN_ON(dst->metadata != src->metadata) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
 		u64 *dst_blk;
 		const u64 *src_blk;
 		size_t hdr_cnt;
 		size_t ctr_cnt;

-		if (!kbase_hwcnt_enable_map_block_enabled(
-			dst_enable_map, grp, blk, blk_inst))
+		if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
 			continue;

-		dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		src_blk = kbase_hwcnt_dump_buffer_block_instance(
-			src, grp, blk, blk_inst);
-		hdr_cnt = kbase_hwcnt_metadata_block_headers_count(
-			metadata, grp, blk);
-		ctr_cnt = kbase_hwcnt_metadata_block_counters_count(
-			metadata, grp, blk);
+		dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+		hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
+		ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);

-		kbase_hwcnt_dump_buffer_block_accumulate(
-			dst_blk, src_blk, hdr_cnt, ctr_cnt);
+		kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk, hdr_cnt, ctr_cnt);
 	}

-	kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
-		if (kbase_hwcnt_clk_enable_map_enabled(
-			dst_enable_map->clk_enable_map, clk))
+	kbase_hwcnt_metadata_for_each_clock(metadata, clk)
+	{
+		if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
 			dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk];
 	}
 }

-void kbase_hwcnt_dump_buffer_accumulate_strict(
-	struct kbase_hwcnt_dump_buffer *dst,
-	const struct kbase_hwcnt_dump_buffer *src,
-	const struct kbase_hwcnt_enable_map *dst_enable_map)
+void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *dst,
+					       const struct kbase_hwcnt_dump_buffer *src,
+					       const struct kbase_hwcnt_enable_map *dst_enable_map)
 {
 	const struct kbase_hwcnt_metadata *metadata;
 	size_t grp, blk, blk_inst;
 	size_t clk;

-	if (WARN_ON(!dst) ||
-	    WARN_ON(!src) ||
-	    WARN_ON(!dst_enable_map) ||
-	    WARN_ON(dst == src) ||
+	if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
 	    WARN_ON(dst->metadata != src->metadata) ||
 	    WARN_ON(dst->metadata != dst_enable_map->metadata))
 		return;

 	metadata = dst->metadata;

-	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
-		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(
-			dst, grp, blk, blk_inst);
-		const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(
-			src, grp, blk, blk_inst);
-		const u64 *blk_em = kbase_hwcnt_enable_map_block_instance(
-			dst_enable_map, grp, blk, blk_inst);
-		size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(
-			metadata, grp, blk);
-		size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(
-			metadata, grp, blk);
+	kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
+	{
+		u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+		const u64 *src_blk =
+			kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+		const u64 *blk_em =
+			kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
+		size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
+		size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
 		/* Align upwards to include padding bytes */
-		ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS(hdr_cnt + ctr_cnt,
-			(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
-			 KBASE_HWCNT_VALUE_BYTES) - hdr_cnt);
+		ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
+			hdr_cnt + ctr_cnt,
+			(KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES) - hdr_cnt);

-		kbase_hwcnt_dump_buffer_block_accumulate_strict(
-			dst_blk, src_blk, blk_em, hdr_cnt, ctr_cnt);
+		kbase_hwcnt_dump_buffer_block_accumulate_strict(dst_blk, src_blk, blk_em, hdr_cnt,
+								ctr_cnt);
 	}

-	kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
-		if (kbase_hwcnt_clk_enable_map_enabled(
-			dst_enable_map->clk_enable_map, clk))
+	kbase_hwcnt_metadata_for_each_clock(metadata, clk)
+	{
+		if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
 			dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk];
 		else
 			dst->clk_cnt_buf[clk] = 0;
--- a/Show More
+++ b/Show More