1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
| nsys profile \
    --trace=cuda,osrt,nvtx \
    --stats=true \
    -o gemm_nsys \
    ./gemm_naive
WARNING: CPU IP/backtrace sampling not supported, disabling. Try the 'nsys status --environment' command to learn more.
WARNING: CPU context switch tracing not supported, disabling. Try the 'nsys status --environment' command to learn more.
C[0] = 1024.000000
Generating '/tmp/nsys-report-d705.qdstrm'
[1/8] [========================100%] gemm_nsys.nsys-rep
[2/8] [========================100%] gemm_nsys.sqlite
[3/8] Executing 'nvtxsum' stats report
SKIPPED: /home/xjy/culen/gemm_nsys.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrtsum' stats report
Time (%)  Total Time (ns)  Num Calls       Avg (ns)       Med (ns)   Min (ns)     Max (ns)    StdDev (ns)  Name
--------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  --------------
    67.6      225,051,573          2  112,525,786.5  112,525,786.5  1,452,590  223,598,983  157,081,220.9  sem_wait
    16.5       54,956,396         10    5,495,639.6      555,811.5      2,555   31,027,255   10,373,816.7  poll
    15.1       50,405,423        534       94,392.2        4,414.0      1,012   19,802,848    1,014,855.6  ioctl
     0.3          898,995          9       99,888.3       73,013.0     55,891      322,717       85,225.0  sem_timedwait
     0.2          819,490         28       29,267.5        3,400.5      1,298      543,432      101,281.8  mmap64
     0.1          293,599         52        5,646.1        5,544.0      1,699       22,856        2,930.6  open64
     0.0          128,149          5       25,629.8       26,079.0     18,581       32,930        5,883.9  pthread_create
     0.0           88,219         14        6,301.4        4,594.0      1,152       33,522        8,253.7  mmap
     0.0           71,150         25        2,846.0        1,850.0      1,004        8,543        2,305.9  fopen
     0.0           33,323          4        8,330.8        5,519.0      2,010       20,275        8,504.6  fgets
     0.0           22,578          5        4,515.6        4,108.0      1,825        9,132        2,992.4  fread
     0.0           20,631          2       10,315.5       10,315.5      2,795       17,836       10,635.6  socket
     0.0           15,608          5        3,121.6        3,126.0      2,525        3,786          499.1  munmap
     0.0           14,326          5        2,865.2        2,648.0      1,036        4,671        1,403.5  open
     0.0            9,046          7        1,292.3        1,231.0      1,082        1,914          283.1  read
     0.0            7,664          3        2,554.7        1,847.0      1,094        4,723        1,915.2  fclose
     0.0            6,706          1        6,706.0        6,706.0      6,706        6,706            0.0  connect
     0.0            4,434          1        4,434.0        4,434.0      4,434        4,434            0.0  pipe2
     0.0            2,364          1        2,364.0        2,364.0      2,364        2,364            0.0  fcntl
     0.0            1,661          1        1,661.0        1,661.0      1,661        1,661            0.0  fopen64
     0.0            1,261          1        1,261.0        1,261.0      1,261        1,261            0.0  bind
     0.0            1,090          1        1,090.0        1,090.0      1,090        1,090            0.0  write
[5/8] Executing 'cudaapisum' stats report
Time (%)  Total Time (ns)  Num Calls      Avg (ns)   Med (ns)  Min (ns)    Max (ns)   StdDev (ns)  Name
--------  ---------------  ---------  ------------  ---------  --------  ----------  ------------  ----------------------
    95.5       76,432,871          3  25,477,623.7   27,590.0    23,732  76,381,549  44,084,092.5  cudaMalloc
     2.9        2,355,527          3     785,175.7  594,232.0   503,424   1,257,871     411,876.4  cudaMemcpy
     1.1          858,646          2     429,323.0  429,323.0   429,016     429,630         434.2  cudaDeviceSynchronize
     0.3          238,813          3      79,604.3   89,958.0    55,161      93,694      21,250.8  cudaFree
     0.1          119,589          2      59,794.5   59,794.5     2,859     116,730      80,519.0  cudaLaunchKernel
     0.0              942          1         942.0      942.0       942         942           0.0  cuModuleGetLoadingMode
[6/8] Executing 'gpukernsum' stats report
Time (%)  Total Time (ns)  Instances   Avg (ns)   Med (ns)  Min (ns)  Max (ns)  StdDev (ns)         GridXYZ        BlockXYZ  Name
--------  ---------------  ---------  ---------  ---------  --------  --------  -----------  --------------  --------------  ----------------------------------------------------------------
   100.0          856,482          2  428,241.0  428,241.0   427,585   428,897        927.7         64 64 1         16 16 1  gemm_naive(const float *, const float *, float *, int, int, int)
[7/8] Executing 'gpumemtimesum' stats report
Time (%)  Total Time (ns)  Count   Avg (ns)   Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
--------  ---------------  -----  ---------  ---------  --------  --------  -----------  ------------------
    56.4          916,771      2  458,385.5  458,385.5   411,170   505,601     66,772.8  [CUDA memcpy HtoD]
    43.6          709,090      1  709,090.0  709,090.0   709,090   709,090          0.0  [CUDA memcpy DtoH]
[8/8] Executing 'gpumemsizesum' stats report
Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
----------  -----  --------  --------  --------  --------  -----------  ------------------
     8.389      2     4.194     4.194     4.194     4.194        0.000  [CUDA memcpy HtoD]
     4.194      1     4.194     4.194     4.194     4.194        0.000  [CUDA memcpy DtoH]
Generated:
|