NVIDIA 已经发布了他们的 Kepler 架构(Kepler Architecture),CUDA toolkit 也做了更新。下面以三代不同的 GTX 显卡(480、580 和 680)来考察:
Matrix multiplication:
可以看到,GTX 680 的单精度(single precision)性能达到 1 Teraflop,GTX 580 与之接近。然而,680 在约 2048×2048 处达到峰值,之后回落到接近 580 的性能。高端的 Tesla C2070 的单精度性能位于第三系列的 480 之后。
对于双精度,C2070领先群雄。
Fast Fourier Transform:
从 480 到 580 的性能提升显著(20%),而 680 似乎没有比之前优越多少。快速傅立叶变换这项测试,能让这些卡在接近耗尽内存之前达到峰值性能。680 拥有 2GB 显存,可容纳两个 8192 × 8192 的单精度复数矩阵,但该算法所需的暂存空间超过了可用空间。所有变换都是二维(2D)的实数到复数变换。
SORT:
这里 GTX 680 一开始较强,但之后失去了对 GTX 580 的优势,并最终落后于 480。我们使用相同的基数排序算法作为基准。令人吃惊的是,在峰值时 680 慢了 20% 以上。
附件资源:
Benchmark Code
Benchmark Results
相关链接:
Tom’s hardware: LuxMark Benchmarks
Anandtech: Retaking the performance crown
测试代码使用arrayfire:
- #include <stdio.h>
- #include <arrayfire.h>
- using namespace af;
- // Shared benchmark input: main() reassigns it before each timeit() call,
- // and every bench_* function reads it as its operand.
- array in;
- // Benchmark kernel: multiplies the global array `in` by itself with matmul().
- // The product lives only in a local and is dropped on return; main() passes
- // this function to timeit().
- void bench_blas()
- {
- array product = matmul(in, in);
- }
- // Benchmark kernel: applies fft2() to the global array `in`.
- // The transform result lives only in a local and is dropped on return;
- // main() passes this function to timeit().
- void bench_fft()
- {
- array spectrum = fft2(in);
- }
- // Benchmark kernel: runs lu() on the global array `in`, writing into two
- // local output arrays that are dropped on return; main() passes this
- // function to timeit().
- void bench_lu()
- {
- array lower;
- array upper;
- lu(lower, upper, in);
- }
- // Benchmark kernel: calls sort() on the global array `in`.
- // The sorted result lives only in a local and is dropped on return;
- // main() passes this function to timeit().
- void bench_sort()
- {
- array ordered = sort(in);
- }
- // Benchmark driver. Selects device 1, prints the device info, then runs four
- // benchmark sections (blas, lu, fft, sort). For every problem size each
- // section fills the global `in` with randu() data, first as f32 and then as
- // f64, passes the matching bench_* function to timeit(), and prints both
- // results on one line. argc/argv are not used. Always returns 0.
- int main(int argc, char **argv)
- {
- af::deviceset(1);
- af::info();
- // Matrix multiply: square n x n inputs, n = 1024, 2048, ..., 8192.
- printf("Benching blas\n");
- for (int n = 1024; n <= 8192; n+=1024) {
- double time_s, time_d;
- in = randu(n, n, f32);
- time_s = timeit(bench_blas);
- in = randu(n, n, f64);
- // Double-precision run, timed unconditionally like the other sections.
- time_d = timeit(bench_blas);
- printf("[%4d x %4d] Single: %4.4lf Double: %4.4lf\n", n, n, time_s, time_d);
- }
- printf("\n\n");
- // LU: square n x n inputs, n = 512, 1024, ..., 4096.
- printf("Benching lu\n");
- for (int n = 512; n <= 4096; n+=512) {
- double time_s, time_d;
- in = randu(n, n, f32);
- time_s = timeit(bench_lu);
- in = randu(n, n, f64);
- time_d = timeit(bench_lu);
- printf("[%4d x %4d] Single: %4.4lf Double: %4.4lf\n", n, n, time_s, time_d);
- }
- printf("\n\n");
- // FFT: square n x n inputs, n doubling from 256 up to 4096.
- printf("Benching fft\n");
- for (int n = 256; n <= 4096; n*=2) {
- double time_s, time_d;
- in = randu(n, n, f32);
- time_s = timeit(bench_fft);
- in = randu(n, n, f64);
- time_d = timeit(bench_fft);
- printf("[%4d x %4d] Single: %4.4lf Double: %4.4lf\n", n, n, time_s, time_d);
- }
- printf("\n\n");
- // Sort: n x 1 inputs, n doubling from 1024 while n <= 50e6, so the last
- // size run is 33554432 (2^25); the next doubling exceeds the bound.
- printf("Benching sort\n");
- for (int n = 1024; n <= 50e6; n*=2) {
- double time_s, time_d;
- in = randu(n, 1, f32);
- time_s = timeit(bench_sort);
- in = randu(n, 1, f64);
- time_d = timeit(bench_sort);
- printf("[%8d x 1] Single: %4.4lf Double: %4.4lf\n", n, time_s, time_d);
- }
- printf("\n\n");
- return 0;
- }