% Bibliography database.
%
% Conventions used in this file:
%   - one field per line, values delimited with braces;
%   - author names in "Last, First" form, separated by " and ";
%   - page ranges written with a double hyphen (e.g. 72--86);
%   - acronyms and proper nouns inside titles are wrapped in braces so that
%     sentence-casing bibliography styles do not lowercase them;
%   - DOIs are stored bare (no resolver prefix).
% Text outside of @entries (such as these lines) is ignored by BibTeX.

@article{mei2017microbench,
  author    = {Mei, Xinxin and Chu, Xiaowen},
  title     = {Dissecting {GPU} Memory Hierarchy Through Microbenchmarking},
  journal   = {IEEE Trans. Parallel Distrib. Syst.},
  volume    = {28},
  number    = {1},
  pages     = {72--86},
  publisher = {IEEE Press},
  year      = {2017},
  issn      = {1045-9219},
  doi       = {10.1109/TPDS.2016.2549523},
}

@inproceedings{zhang2017performance,
  author    = {Zhang, Xiuxia and Tan, Guangming and Xue, Shuangbai and Li, Jiajia and Zhou, Keren and Chen, Mingyu},
  title     = {Understanding the {GPU} Microarchitecture to Achieve Bare-Metal Performance Tuning},
  booktitle = {Proceedings of the 22nd {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '17},
  pages     = {31--43},
  publisher = {ACM},
  year      = {2017},
  isbn      = {978-1-4503-4493-7},
  doi       = {10.1145/3018743.3018755},
}

% arXiv preprint. The journal/volume pair is kept for styles that only know
% the classic fields; eprint/archiveprefix carry the same identifier in
% machine-readable form for eprint-aware styles.
@article{lim2017autotuning,
  author        = {Lim, Robert V. and Norris, Boyana and Malony, Allen D.},
  title         = {Autotuning {GPU} Kernels via Static and Predictive Analysis},
  journal       = {CoRR},
  volume        = {abs/1701.08547},
  year          = {2017},
  eprint        = {1701.08547},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1701.08547},
}

% "number" holds the report identifier from the URL below. It is not a
% standard @phdthesis field, so the classic styles will not print it.
@phdthesis{volkov2016thesis,
  author = {Volkov, Vasily},
  title  = {Understanding Latency Hiding on {GPUs}},
  school = {EECS Department, University of California, Berkeley},
  number = {UCB/EECS-2016-143},
  year   = {2016},
  url    = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-143.html},
}

% Conference paper: typed as @inproceedings so that "booktitle" is actually
% used (classic @inbook ignores it and expects chapter/book-title fields).
@inproceedings{mei2014,
  author    = {Mei, Xinxin and Zhao, Kaiyong and Liu, Chengjian and Chu, Xiaowen},
  title     = {Benchmarking the Memory Hierarchy of Modern {GPUs}},
  booktitle = {Network and Parallel Computing: 11th IFIP WG 10.3 International Conference},
  pages     = {144--156},
  publisher = {Springer Berlin Heidelberg},
  year      = {2014},
  isbn      = {978-3-662-44917-2},
  doi       = {10.1007/978-3-662-44917-2_13},
}

@article{zhang2014performance,
  author  = {Zhang, Ying and Peng, Lu and Li, Bin and Peir, Jih-Kwon and Chen, Jianmin},
  title   = {Performance and Power Comparisons Between {Nvidia} and {ATI} {GPUs}},
  journal = {International Journal of Computer Science \& Information Technology},
  volume  = {6},
  number  = {6},
  year    = {2014},
}

@inproceedings{zhang2011ati,
  author    = {Zhang, Ying and Hu, Yue and Li, Bin and Peng, Lu},
  title     = {Performance and power analysis of {ATI} {GPU}: A statistical approach},
  booktitle = {6th {IEEE} International Conference on Networking, Architecture and Storage ({NAS})},
  pages     = {149--158},
  year      = {2011},
}

% NOTE(review): volume = 10 and pages = 16 look like scraped placeholder
% values rather than real proceedings data -- confirm against the source
% before relying on them.
@inproceedings{volkov2010occupation,
  author    = {Volkov, Vasily},
  title     = {Better performance at lower occupancy},
  booktitle = {Proceedings of the {GPU} Technology Conference ({GTC})},
  volume    = {10},
  pages     = {16},
  year      = {2010},
}

@inproceedings{konstantinidis2016gpumembench,
  author    = {Konstantinidis, Elias and Cotronis, Yiannis},
  title     = {A Quantitative Performance Evaluation of Fast on-Chip Memories of {GPUs}},
  booktitle = {24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP)},
  pages     = {448--455},
  year      = {2016},
  doi       = {10.1109/PDP.2016.56},
}

@article{konstantinidis2017mixbench,
  author  = {Konstantinidis, Elias and Cotronis, Yiannis},
  title   = {A quantitative roofline model for {GPU} kernel performance estimation using micro-benchmarks and hardware metric profiling},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {107},
  pages   = {37--56},
  year    = {2017},
  doi     = {10.1016/j.jpdc.2017.04.002},
}