diff --git a/.github/workflows/sycl-ur-perf-benchmarking.yml b/.github/workflows/sycl-ur-perf-benchmarking.yml index 4890c1c26a7d1..3732500c00b9d 100644 --- a/.github/workflows/sycl-ur-perf-benchmarking.yml +++ b/.github/workflows/sycl-ur-perf-benchmarking.yml @@ -227,30 +227,14 @@ jobs: toolchain_decompress_command: ${{ needs.build_nightly.outputs.toolchain_decompress_command }} # END nightly benchmarking path - # Benchmark framework builds and runs on PRs path: - build_pr: - name: '[PR] Build SYCL' - if: github.event_name == 'pull_request' - uses: ./.github/workflows/sycl-linux-build.yml - with: - build_ref: ${{ github.sha }} - build_cache_root: "/__w/" - build_cache_suffix: "default" - # Docker image has last nightly pre-installed and added to the PATH - build_image: "ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest" - cc: clang - cxx: clang++ - changes: '[]' - toolchain_artifact: sycl_linux_default - + # BEGIN benchmark framework builds and runs on PRs path # TODO: When we have stable BMG runner(s), consider moving this job to that runner. test_benchmark_framework: name: '[PR] Benchmark suite testing' - needs: [build_pr] permissions: contents: write packages: read - if: always() && !cancelled() && needs.build_pr.outputs.build_conclusion == 'success' + if: github.event_name == 'pull_request' uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: 'Framework test: PVC_PERF, L0, Minimal preset' diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 7321dc5aa932c..a131d334afc45 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -113,10 +113,10 @@ runs: # modified output the entire sycl build dir as an artifact, in which the # intermediate files required can be stitched together from the build files. # However, this is not exactly "clean" or "fun to maintain"... 
- - name: Build Unified Runtime + - name: Build LLVM shell: bash run: | - # Build Unified Runtime + echo "::group::checkout_llvm" # Sparse-checkout UR at build ref: git clone --depth 1 --no-checkout https://github.com/intel/llvm ur cd ur @@ -124,6 +124,8 @@ runs: git sparse-checkout set unified-runtime git fetch origin ${{ inputs.build_ref }} git checkout FETCH_HEAD + echo "::endgroup::" + echo "::group::configure_llvm" # Configure UR mkdir build install @@ -135,32 +137,41 @@ runs: -DUR_BUILD_ADAPTER_L0=ON \ -DUR_BUILD_ADAPTER_L0_V2=ON - # Build and install UR + echo "::endgroup::" + echo "::group::build_and_install_llvm" + cmake --build build -j "$(nproc)" cmake --install build cd - + + echo "::endgroup::" # Install level zero v1.25.2 # This is to have the latest level zero required by Compute Benchmarks # Remove this w/a once the sycl nightly images are updated to have level zero v1.25.2 - name: Install level zero v1.25.2 shell: bash run: | - # Install level zero v1.25.2 + echo "::group::checkout_level_zero" # Checkout Level Zero at build ref: wget https://github.com/oneapi-src/level-zero/archive/refs/tags/v1.25.2.tar.gz -O level-zero-v1.25.2.tar.gz tar -xvf level-zero-v1.25.2.tar.gz cd level-zero-1.25.2 - # Configure Level Zero + echo "::endgroup::" + echo "::group::configure_level_zero" + cmake -DCMAKE_BUILD_TYPE=Release \ -Bbuild - # Build and install Level Zero + echo "::endgroup::" + echo "::group::build_and_install_level_zero" + cmake --build build -j "$(nproc)" sudo cmake --install build cd - + echo "::endgroup::" - name: Set env var for results branch shell: bash run: | @@ -181,10 +192,10 @@ runs: SAVE_PREFIX: ${{ inputs.save_name }} shell: bash run: | - # Build and run benchmarks # TODO generate summary + display helpful message here export CMPLR_ROOT=./toolchain - echo "-----" + echo "::group::install_python_deps" + echo "Installing python dependencies..." 
# Using --break-system-packages because: # - venv is not installed # - unable to install anything via pip, as python packages in the docker # - apt is unable to install anything due to unresolved dpkg dependencies, # as a result of how the sycl nightly images are created pip install --user --break-system-packages -r ./devops/scripts/benchmarks/requirements.txt - echo "-----" + echo "::endgroup::" + echo "::group::sycl_ls" # By default, the benchmark scripts forceload level_zero FORCELOAD_ADAPTER="${ONEAPI_DEVICE_SELECTOR%%:*}" @@ -228,7 +240,8 @@ runs: export COMPUTE_RUNTIME_TAG_CACHE="$(cat ./devops/dependencies.json | jq -r .linux.compute_runtime.github_tag)" sycl-ls - echo "-----" + echo "::endgroup::" + echo "::group::run_benchmarks" WORKDIR="$(realpath ./llvm_test_workdir)" if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi @@ -247,7 +260,8 @@ ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }} # TODO: add back: "--flamegraph inclusive" once works properly - echo "-----" + echo "::endgroup::" + echo "::group::compare_results" python3 ./devops/scripts/benchmarks/compare.py to_hist \ --avg-type EWMA \ --cutoff "$(date -u -d '7 days ago' +'%Y%m%d_%H%M%S')" \ @@ -260,7 +274,9 @@ --produce-github-summary \ ${{ inputs.dry_run == 'true' && '--dry-run' || '' }} \ - echo "-----" + echo "::endgroup::" + + LLVM_BENCHMARKS_UNIT_TESTING=1 COMPUTE_BENCHMARKS_BUILD_PATH=$WORKDIR/compute-benchmarks-build python3 ./devops/scripts/benchmarks/tests/test_integration.py - name: Cache changes and upload github summary if: always() diff --git a/devops/scripts/benchmarks/README.md b/devops/scripts/benchmarks/README.md index 5429f75788015..d312da59b127a 100644 --- a/devops/scripts/benchmarks/README.md +++ b/devops/scripts/benchmarks/README.md @@ -52,6 +52,21 @@ $ cmake --build ~/ur_build -j $(nproc) $ cmake --install ~/ur_build ``` +## Testing + +There is an integration test that runs the benchmarking code and checks some of +its internal data structures. To use it, you must: +- prepare the environment yourself (Level Zero, oneAPI, or another source of a SYCL +`clang++` compiler) +- set `CMPLR_ROOT` to a toolchain directory that provides `clang++` +- set `COMPUTE_BENCHMARKS_BUILD_PATH` to an existing compute-benchmarks build directory +- set `LLVM_BENCHMARKS_UNIT_TESTING=1` + +Then run the tests with: +``` +python3 ./devops/scripts/benchmarks/tests/test_integration.py +``` + ## Results By default, the benchmark results are not stored.
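The Testing section added to the README above lists three environment prerequisites but nothing enforces them. The sketch below is a minimal, hypothetical preflight check (not part of this patch) that mirrors those bullets; only the variable names come from the README and the CI action, everything else is illustrative:

```python
# Hypothetical preflight helper, not part of the patch: checks the
# prerequisites documented in the README's Testing section before the
# integration test is launched from the repository root.
import os
import sys

problems = []
if not os.environ.get("CMPLR_ROOT"):
    problems.append("CMPLR_ROOT is unset; it must point at a toolchain providing clang++")
if not os.path.isdir(os.environ.get("COMPUTE_BENCHMARKS_BUILD_PATH", "")):
    problems.append("COMPUTE_BENCHMARKS_BUILD_PATH is not an existing compute-benchmarks build directory")
if os.environ.get("LLVM_BENCHMARKS_UNIT_TESTING") != "1":
    problems.append("LLVM_BENCHMARKS_UNIT_TESTING must be 1 so git_project.py skips git operations")

if problems:
    sys.exit("Integration test prerequisites not met:\n  " + "\n  ".join(problems))
print("Environment looks ready: python3 ./devops/scripts/benchmarks/tests/test_integration.py")
```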
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 96dfc61e56dd6..a4e09198c226f 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -270,6 +270,52 @@ def benchmarks(self) -> list[Benchmark]: ) ) + record_and_replay_params = product([0, 1], [0, 1]) + for emulate, instantiate in record_and_replay_params: + + def createRrBench(variant_name: str, **kwargs): + return RecordAndReplay( + self, + RUNTIMES.LEVEL_ZERO, + variant_name, + PROFILERS.TIMER, + mRec=1, + mInst=instantiate, + mDest=0, + emulate=emulate, + **kwargs, + ) + + benches += [ + createRrBench( + "large", + nForksInLvl=2, + nLvls=4, + nCmdSetsInLvl=10, + nInstantiations=10, + nAppendKern=10, + nAppendCopy=1, + ), + createRrBench( + "medium", + nForksInLvl=1, + nLvls=1, + nCmdSetsInLvl=10, + nInstantiations=10, + nAppendKern=10, + nAppendCopy=10, + ), + createRrBench( + "short", + nForksInLvl=1, + nLvls=4, + nCmdSetsInLvl=1, + nInstantiations=0, + nAppendKern=1, + nAppendCopy=0, + ), + ] + # Add UR-specific benchmarks benches += [ # TODO: multithread_benchmark_ur fails with segfault @@ -648,6 +694,49 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: ] +class RecordAndReplay(ComputeBenchmark): + def __init__( + self, bench, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs + ): + self.variant_name = variant_name + self.rr_params = kwargs + self.iterations_regular = 1000 + self.iterations_trace = 10 + super().__init__( + bench, + f"record_and_replay_benchmark_{runtime.value}", + "RecordGraph", + runtime, + profiler_type, + ) + + def explicit_group(self): + return f"{self.test} {self.variant_name}" + + def display_name(self) -> str: + return f"{self.explicit_group()}_{self.runtime.value}" + + def name(self): + ret = [] + for k, v in self.rr_params.items(): + if k[0] == "n": # numeric parameter + ret.append(f"{k[1:]} {v}") + elif k[0] == "m": + if v != 0: # measure parameter + ret.append(f"{k[1:]}") + else: # boolean parameter + if v != 0: + ret.append(k) + ret.sort() + return self.bench_name + " " + ", ".join(ret) + + def get_tags(self): + return ["L0"] + + def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: + return [f"--{k}={v}" for k, v in self.rr_params.items()] + + class QueueInOrderMemcpy(ComputeBenchmark): def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type): self.isCopyOnly = isCopyOnly diff --git a/devops/scripts/benchmarks/git_project.py b/devops/scripts/benchmarks/git_project.py index 7d2ffd6706b44..1761b0049224b 100644 --- a/devops/scripts/benchmarks/git_project.py +++ b/devops/scripts/benchmarks/git_project.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import os from pathlib import Path import shutil @@ -10,7 +11,6 @@ from utils.utils import run from options import options - class GitProject: def __init__( self, @@ -167,6 +167,11 @@ def _setup_repo(self) -> bool: Returns: bool: True if the repository was cloned or updated, False if it was already up-to-date. """ + if os.environ.get("LLVM_BENCHMARKS_UNIT_TESTING") == "1": + log.debug( + f"Skipping git operations during unit testing of {self._name} (LLVM_BENCHMARKS_UNIT_TESTING=1)." 
+ ) + return False if not self.src_dir.exists(): self._git_clone() return True diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index de372d7279cc2..a63eca053a598 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -137,10 +137,13 @@ def process_results( stddev_threshold_override if stddev_threshold_override is not None else options.stddev_threshold - ) * mean_value + ) + threshold_scaled = threshold * mean_value - if stddev > threshold: - log.warning(f"stddev {stddev} above the threshold {threshold} for {label}") + if stddev > threshold_scaled: + log.warning( + f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}" + ) valid_results = False rlist.sort(key=lambda res: res.value) @@ -228,6 +231,10 @@ def main(directory, additional_env_vars, compare_names, filter): benchmark for benchmark in s.benchmarks() if benchmark.enabled() ] if filter: + # log.info(f"all benchmarks:\n" + "\n".join([b.name() for b in suite_benchmarks])) + log.debug( + f"Filtering {len(suite_benchmarks)} benchmarks in {s.name()} suite for {filter.pattern}" + ) suite_benchmarks = [ benchmark for benchmark in suite_benchmarks @@ -713,6 +720,7 @@ def validate_and_parse_env_args(env_args): options.dry_run = args.dry_run options.umf = args.umf options.iterations_stddev = args.iterations_stddev + options.stddev_threshold = args.stddev_threshold options.build_igc = args.build_igc options.current_run_name = args.relative_perf options.cudnn_directory = args.cudnn_directory diff --git a/devops/scripts/benchmarks/tests/test_integration.py b/devops/scripts/benchmarks/tests/test_integration.py new file mode 100644 index 0000000000000..f4be229404850 --- /dev/null +++ b/devops/scripts/benchmarks/tests/test_integration.py @@ -0,0 +1,188 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import logging +import os +import shutil +import unittest +import tempfile +import subprocess +import json +from collections import namedtuple + +DataJson = namedtuple("DataJson", ["runs", "metadata", "tags", "names"]) +DataJsonRun = namedtuple("DataJsonRun", ["name", "results"]) +DataJsonResult = namedtuple( + "DataJsonResult", ["name", "label", "suite", "value", "unit"] +) +DataJsonMetatdata = namedtuple( + "DataJsonMetatdata", + [ + "type", + "unstable", + "tags", + "range_min", + "range_max", + "display_name", + "explicit_group", + ], +) + + +class App: + def __init__(self): + self.OUTPUT_DIR = None + self.RESULTS_DIR = None + self.WORKDIR_DIR = None + + def prepare_dirs(self): + self.OUTPUT_DIR = tempfile.mkdtemp() + self.RESULTS_DIR = tempfile.mkdtemp() + self.WORKDIR_DIR = tempfile.mkdtemp() + + # when UT does not want to build compute-benchmarks from scratch, it can provide prebuilt path + cb_targetpath = os.environ.get("COMPUTE_BENCHMARKS_BUILD_PATH") + if cb_targetpath and os.path.isdir(cb_targetpath): + cb_build_dir = os.path.join(self.WORKDIR_DIR, "compute-benchmarks-build") + os.symlink(cb_targetpath, cb_build_dir) + with open( + os.path.join(self.WORKDIR_DIR, "BENCH_WORKDIR_VERSION"), "w" + ) as f: + f.write("2.0") # TODO: take from main.INTERNAL_WORKDIR_VERSION + + def remove_dirs(self): + for d in [self.RESULTS_DIR, self.OUTPUT_DIR, self.WORKDIR_DIR]: + if d is not None: + shutil.rmtree(d, ignore_errors=True) + + def run_main(self, *args): + + # TODO: not yet tested: "--detect-version", "sycl,compute_runtime" + + procesResult = subprocess.run( + [ + "./devops/scripts/benchmarks/main.py", + self.WORKDIR_DIR, + "--sycl", + os.environ.get("CMPLR_ROOT"), + "--save", + "testfile", + "--output-html", + "remote", + "--results-dir", + self.RESULTS_DIR, + "--output-dir", + self.OUTPUT_DIR, + "--preset", + "Minimal", + "--timestamp-override", + "20240102_030405", + "--stddev-threshold", + "999999999.9", + "--exit-on-failure", + *args, + ], + capture_output=True, + ) + print("MAIN_PY_STDOUT:\n" + procesResult.stdout.decode()) + print("MAIN_PY_STDERR:\n" + procesResult.stderr.decode()) + return procesResult.returncode + + def get_output(self): + with open(os.path.join(self.OUTPUT_DIR, "data.json")) as f: + out = json.load(f) + return DataJson( + runs=[ + DataJsonRun( + name=run["name"], + results=[ + DataJsonResult( + name=r["name"], + label=r["label"], + suite=r["suite"], + value=r["value"], + unit=r["unit"], + ) + for r in run["results"] + ], + ) + for run in out["benchmarkRuns"] + ], + metadata=dict( + [ + ( + k, + DataJsonMetatdata( + type=v["type"], + unstable=v.get("unstable", False), + tags=v.get("tags", []), + range_min=v.get("range_min"), + range_max=v.get("range_max"), + display_name=v.get("display_name"), + explicit_group=v.get("explicit_group"), + ), + ) + for k, v in out["benchmarkMetadata"].items() + ] + ), + tags=out["benchmarkTags"], + names=out["defaultCompareNames"], + ) + + +# add "--verbose" for debug logs + + +class TestE2E(unittest.TestCase): + def setUp(self): + # Load test data + print(f"::group::{self._testMethodName}") + self.app = App() + self.app.remove_dirs() + self.app.prepare_dirs() + + def tearDown(self): + self.app.remove_dirs() + print(f"::endgroup::") + + def _checkGroup( + self, expectedGroupName: str, benchMetadata: DataJsonMetatdata, out: DataJson + ): + self.assertEqual(benchMetadata.type, "benchmark") + benchmarkGroupName = benchMetadata.explicit_group + 
self.assertEqual(benchmarkGroupName, expectedGroupName) + groupMetadata = out.metadata[benchmarkGroupName] + self.assertEqual(groupMetadata.type, "group") + + def _checkResultsExist(self, caseName: str, out: DataJson): + self.assertIn(caseName, [r.name for r in out.runs[0].results]) + + def _checkCase(self, caseName: str, groupName: str, tags: set[str]): + run_result = self.app.run_main("--filter", caseName + "$") + self.assertEqual(run_result, 0, "Subprocess did not exit cleanly") + + out = self.app.get_output() + self._checkResultsExist(caseName, out) + + metadata = out.metadata[caseName] + self.assertEqual(set(metadata.tags), tags) + self._checkGroup(groupName, metadata, out) + + def test_record_and_replay(self): + self._checkCase( + "record_and_replay_benchmark_l0 AppendCopy 1, AppendKern 10, CmdSetsInLvl 10, ForksInLvl 2, Instantiations 10, Lvls 4, Rec", + "RecordGraph large", + {"L0"}, + ) + + def test_submit_kernel(self): + self._checkCase( + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion KernelExecTime=20", + "SubmitKernel out of order with completion using events long kernel", + {"L0", "latency", "micro", "submit"}, + ) + + +if __name__ == "__main__": + unittest.main()
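The case name asserted in test_record_and_replay is produced by RecordAndReplay.name() in benches/compute.py: keyword arguments starting with "n" are numeric parameters rendered as "Name value", arguments starting with "m" are measurement flags rendered only when non-zero, and anything else is treated as a boolean flag. The standalone sketch below reproduces that convention outside the framework so the expected string can be derived by hand; the helper rr_name is invented for illustration and is not framework code:

```python
# Standalone sketch of the naming convention used by RecordAndReplay.name()
# in benches/compute.py; rr_name is an illustrative helper, not framework code.
def rr_name(bench_name: str, **rr_params) -> str:
    parts = []
    for k, v in rr_params.items():
        if k[0] == "n":      # numeric parameter: nLvls=4 -> "Lvls 4"
            parts.append(f"{k[1:]} {v}")
        elif k[0] == "m":    # measurement flag: mRec=1 -> "Rec", mDest=0 -> dropped
            if v != 0:
                parts.append(k[1:])
        elif v != 0:         # plain boolean flag: emulate=1 -> "emulate"
            parts.append(k)
    return bench_name + " " + ", ".join(sorted(parts))


# The "large" variant with emulate=0 and mInst=0 yields exactly the case name
# asserted in TestE2E.test_record_and_replay above.
print(rr_name(
    "record_and_replay_benchmark_l0",
    mRec=1, mInst=0, mDest=0, emulate=0,
    nForksInLvl=2, nLvls=4, nCmdSetsInLvl=10,
    nInstantiations=10, nAppendKern=10, nAppendCopy=1,
))
# -> record_and_replay_benchmark_l0 AppendCopy 1, AppendKern 10, CmdSetsInLvl 10, ForksInLvl 2, Instantiations 10, Lvls 4, Rec
```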