ncu nsight

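The test case is a plain 1024×1024×1024 single-precision GEMM: one thread per output element, no shared memory, compiled with nvcc and then profiled with Nsight Systems (nsys).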
vim gemm_naive.cu
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define M 1024
#define N 1024
#define K 1024

// =======================
// naive GEMM kernel
// each thread computes one element C(i,j)
// no shared memory
// =======================
__global__ void gemm_naive(const float* A,
                           const float* B,
                           float* C,
                           int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < n) {
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) {
            sum += A[row * k + i] * B[i * n + col];
        }
        C[row * n + col] = sum;
    }
}

// =======================
// host code
// =======================
int main()
{
    size_t sizeA = M * K * sizeof(float);
    size_t sizeB = K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);

    float *hA = (float*)malloc(sizeA);
    float *hB = (float*)malloc(sizeB);
    float *hC = (float*)malloc(sizeC);

    // init data
    for (int i = 0; i < M * K; ++i) hA[i] = 1.0f;
    for (int i = 0; i < K * N; ++i) hB[i] = 1.0f;

    float *dA, *dB, *dC;
    cudaMalloc(&dA, sizeA);
    cudaMalloc(&dB, sizeB);
    cudaMalloc(&dC, sizeC);

    cudaMemcpy(dA, hA, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeB, cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x,
              (M + block.y - 1) / block.y);

    // warmup launch (absorbs one-time startup overhead)
    gemm_naive<<<grid, block>>>(dA, dB, dC, M, N, K);
    cudaDeviceSynchronize();

    // timed run
    gemm_naive<<<grid, block>>>(dA, dB, dC, M, N, K);
    cudaDeviceSynchronize();

    cudaMemcpy(hC, dC, sizeC, cudaMemcpyDeviceToHost);

    printf("C[0] = %f\n", hC[0]);

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    free(hA);
    free(hB);
    free(hC);

    return 0;
}
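Note that while the second launch is labelled "timed run", the host code never actually measures it; the kernel time in this post comes from the profiler output below. If a wall-clock number is wanted without a profiler, a minimal sketch using CUDA events (an illustrative addition, not part of the original listing) could replace the second launch:

// Sketch: time the second launch with CUDA events.
// Assumes dA, dB, dC, grid and block are set up exactly as in main() above.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
gemm_naive<<<grid, block>>>(dA, dB, dC, M, N, K);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
double flops = 2.0 * M * N * K;           // 2*M*N*K floating-point ops per GEMM
printf("gemm_naive: %.3f ms, %.2f GFLOP/s\n", ms, flops / (ms * 1e6));

cudaEventDestroy(start);
cudaEventDestroy(stop);

Event-based timing brackets only the GPU work on the stream, so it lines up closely with the kernel durations that nsys reports.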
nvcc -O0 -lineinfo gemm_naive.cu -o gemm_naive

nsys profile \
--trace=cuda,osrt,nvtx \
--stats=true \
-o gemm_nsys \
./gemm_naive
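Here -lineinfo embeds source-line information so the profilers can map GPU activity back to the CUDA source, and -O0 leaves the host code unoptimized. On the nsys side, --trace=cuda,osrt,nvtx records CUDA API calls, kernels and memcpys, OS runtime calls, and NVTX ranges; --stats=true prints the summary tables shown below right after the run; -o gemm_nsys names the report. The generated gemm_nsys.nsys-rep can also be opened in the Nsight Systems GUI, or re-summarized later with nsys stats gemm_nsys.nsys-rep.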

Output

nsys profile \
--trace=cuda,osrt,nvtx \
--stats=true \
-o gemm_nsys \
./gemm_naive
WARNING: CPU IP/backtrace sampling not supported, disabling.
Try the 'nsys status --environment' command to learn more.

WARNING: CPU context switch tracing not supported, disabling.
Try the 'nsys status --environment' command to learn more.

C[0] = 1024.000000
Generating '/tmp/nsys-report-d705.qdstrm'
[1/8] [========================100%] gemm_nsys.nsys-rep
[2/8] [========================100%] gemm_nsys.sqlite
[3/8] Executing 'nvtxsum' stats report
SKIPPED: /home/xjy/culen/gemm_nsys.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrtsum' stats report

Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- --------- ----------- ------------- --------------
67.6 225,051,573 2 112,525,786.5 112,525,786.5 1,452,590 223,598,983 157,081,220.9 sem_wait
16.5 54,956,396 10 5,495,639.6 555,811.5 2,555 31,027,255 10,373,816.7 poll
15.1 50,405,423 534 94,392.2 4,414.0 1,012 19,802,848 1,014,855.6 ioctl
0.3 898,995 9 99,888.3 73,013.0 55,891 322,717 85,225.0 sem_timedwait
0.2 819,490 28 29,267.5 3,400.5 1,298 543,432 101,281.8 mmap64
0.1 293,599 52 5,646.1 5,544.0 1,699 22,856 2,930.6 open64
0.0 128,149 5 25,629.8 26,079.0 18,581 32,930 5,883.9 pthread_create
0.0 88,219 14 6,301.4 4,594.0 1,152 33,522 8,253.7 mmap
0.0 71,150 25 2,846.0 1,850.0 1,004 8,543 2,305.9 fopen
0.0 33,323 4 8,330.8 5,519.0 2,010 20,275 8,504.6 fgets
0.0 22,578 5 4,515.6 4,108.0 1,825 9,132 2,992.4 fread
0.0 20,631 2 10,315.5 10,315.5 2,795 17,836 10,635.6 socket
0.0 15,608 5 3,121.6 3,126.0 2,525 3,786 499.1 munmap
0.0 14,326 5 2,865.2 2,648.0 1,036 4,671 1,403.5 open
0.0 9,046 7 1,292.3 1,231.0 1,082 1,914 283.1 read
0.0 7,664 3 2,554.7 1,847.0 1,094 4,723 1,915.2 fclose
0.0 6,706 1 6,706.0 6,706.0 6,706 6,706 0.0 connect
0.0 4,434 1 4,434.0 4,434.0 4,434 4,434 0.0 pipe2
0.0 2,364 1 2,364.0 2,364.0 2,364 2,364 0.0 fcntl
0.0 1,661 1 1,661.0 1,661.0 1,661 1,661 0.0 fopen64
0.0 1,261 1 1,261.0 1,261.0 1,261 1,261 0.0 bind
0.0 1,090 1 1,090.0 1,090.0 1,090 1,090 0.0 write

[5/8] Executing 'cudaapisum' stats report

Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------ --------- -------- ---------- ------------ ----------------------
95.5 76,432,871 3 25,477,623.7 27,590.0 23,732 76,381,549 44,084,092.5 cudaMalloc
2.9 2,355,527 3 785,175.7 594,232.0 503,424 1,257,871 411,876.4 cudaMemcpy
1.1 858,646 2 429,323.0 429,323.0 429,016 429,630 434.2 cudaDeviceSynchronize
0.3 238,813 3 79,604.3 89,958.0 55,161 93,694 21,250.8 cudaFree
0.1 119,589 2 59,794.5 59,794.5 2,859 116,730 80,519.0 cudaLaunchKernel
0.0 942 1 942.0 942.0 942 942 0.0 cuModuleGetLoadingMode

[6/8] Executing 'gpukernsum' stats report

Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -------------- -------------- ----------------------------------------------------------------
100.0 856,482 2 428,241.0 428,241.0 427,585 428,897 927.7 64 64 1 16 16 1 gemm_naive(const float *, const float *, float *, int, int, int)

[7/8] Executing 'gpumemtimesum' stats report

Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- --------- --------- -------- -------- ----------- ------------------
56.4 916,771 2 458,385.5 458,385.5 411,170 505,601 66,772.8 [CUDA memcpy HtoD]
43.6 709,090 1 709,090.0 709,090.0 709,090 709,090 0.0 [CUDA memcpy DtoH]

[8/8] Executing 'gpumemsizesum' stats report

Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- ------------------
8.389 2 4.194 4.194 4.194 4.194 0.000 [CUDA memcpy HtoD]
4.194 1 4.194 4.194 4.194 4.194 0.000 [CUDA memcpy DtoH]
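
The gpukernsum section is the interesting one: both gemm_naive launches take about 428 µs. As a rough sanity check (back-of-the-envelope arithmetic on the numbers above, not a measured figure), a 1024×1024×1024 GEMM performs 2·M·N·K ≈ 2.15 GFLOP, so 428 µs works out to roughly 5 TFLOP/s of FP32 work, achieved with every operand fetched from global memory and no shared-memory reuse. nsys answers where the time goes (kernels vs. memcpys vs. CUDA API overhead); to see why the kernel runs at this rate (memory throughput, occupancy, warp stalls), the next step is a per-kernel profile with Nsight Compute, for example with an invocation along the lines of:

ncu --set full -o gemm_ncu ./gemm_naive

which writes gemm_ncu.ncu-rep for the Nsight Compute GUI.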
