From 6aac4717b878ce3a78c1159ff2794e681a914e5b Mon Sep 17 00:00:00 2001
From: aioprli <126043521+aioprli@users.noreply.github.com>
Date: Mon, 6 May 2024 22:03:16 +0800
Subject: [PATCH] Update globalToShmemAsyncCopy.cu

Fix three errors: the first is that five tasks were submitted to the
pipeline at the same time, so task 4 conflicts with task 0; the remaining
two are copy-paste errors in the B-matrix addresses.
---
 .../globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
index 4b3c4875..73f59671 100644
--- a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
+++ b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
@@ -128,7 +128,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
        a <= aEnd; a += aStep, b += bStep, ++i) {
     // Load the matrices from device memory to shared memory; each thread loads
     // one element of each matrix
-    for (; aStage <= a + aStep * maxPipelineStages;
+    for (; aStage < a + aStep * maxPipelineStages;
          aStage += aStep, bStage += bStep, ++iStage) {
       pipe.producer_acquire();
       if (aStage <= aEnd && t4x < BLOCK_SIZE) {
@@ -137,7 +137,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
         cuda::memcpy_async(&As[j][threadIdx.y][t4x],
                            &A[aStage + wA * threadIdx.y + t4x], shape4, pipe);
         cuda::memcpy_async(&Bs[j][threadIdx.y][t4x],
-                           &B[aStage + wA * threadIdx.y + t4x], shape4, pipe);
+                           &B[bStage + wB * threadIdx.y + t4x], shape4, pipe);
       }
       pipe.producer_commit();
     }
@@ -222,7 +222,7 @@ __global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C,
 
     cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x],
                        shape4, pipe);
-    cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x],
+    cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[b + wB * threadIdx.y + t4x],
                        shape4, pipe);
 
     pipe.producer_commit();
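
Note for reviewers: a minimal host-side sketch (not part of the patch) of why the old `<=` bound oversubscribes the ring buffer. The kernel indexes its shared-memory stages with `j = iStage % maxPipelineStages`, so admitting one extra in-flight stage makes it alias slot 0 while that slot is still being consumed. The stage count of 4 and the stride value below are illustrative assumptions, not taken from the patch itself.

#include <cstdio>

int main() {
  const int maxPipelineStages = 4;  // ring-buffer depth; assumption: 4, as in the sample
  const int aStep = 16;             // arbitrary tile stride, for illustration only
  const int a = 0;

  // Old bound (<=) admits one extra stage: iStage = 4 maps back to slot 0,
  // overwriting data the compute loop has not finished consuming.
  // New bound (<) keeps at most maxPipelineStages stages in flight.
  for (int aStage = a, iStage = 0; aStage < a + aStep * maxPipelineStages;
       aStage += aStep, ++iStage) {
    int j = iStage % maxPipelineStages;  // shared-memory slot used by this stage
    printf("prefetch stage %d -> slot %d\n", iStage, j);
  }
  return 0;
}

With the corrected `<` bound this prints slots 0..3 once each; re-running it with `<=` would show a fifth stage landing back in slot 0, which is the conflict the commit message describes.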