@article{mei2017microbench,
  author    = {Mei, Xinxin and Chu, Xiaowen},
  title     = {Dissecting {GPU} Memory Hierarchy Through Microbenchmarking},
  journal   = {{IEEE} Transactions on Parallel and Distributed Systems},
  volume    = {28},
  number    = {1},
  pages     = {72--86},
  publisher = {IEEE Press},
  year      = {2017},
  issn      = {1045-9219},
  doi       = {10.1109/TPDS.2016.2549523},
}

@inproceedings{zhang2017performance,
  author    = {Zhang, Xiuxia and Tan, Guangming and Xue, Shuangbai and Li, Jiajia and Zhou, Keren and Chen, Mingyu},
  title     = {Understanding the {GPU} Microarchitecture to Achieve Bare-Metal Performance Tuning},
  booktitle = {Proceedings of the 22nd {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '17},
  year      = {2017},
  isbn      = {978-1-4503-4493-7},
  pages     = {31--43},
  doi       = {10.1145/3018743.3018755},
  publisher = {ACM}
}

@article{lim2017autotuning,
  author     = {Lim, Robert V. and Norris, Boyana and Malony, Allen D.},
  title      = {Autotuning {GPU} Kernels via Static and Predictive Analysis},
  journal    = {CoRR},
  volume     = {abs/1701.08547},
  year       = {2017},
  eprint     = {1701.08547},
  eprinttype = {arXiv},
  url        = {http://arxiv.org/abs/1701.08547}
}

@phdthesis{volkov2016thesis,
  author    = {Volkov, Vasily},
  title     = {Understanding Latency Hiding on {GPUs}},
  school    = {EECS Department, University of California, Berkeley},
  number    = {UCB/EECS-2016-143},
  year      = {2016},
  url       = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-143.html}
}

@inproceedings{mei2014,
  author    = {Mei, Xinxin and Zhao, Kaiyong and Liu, Chengjian and Chu, Xiaowen},
  title     = {Benchmarking the Memory Hierarchy of Modern {GPUs}},
  booktitle = {Network and Parallel Computing: 11th {IFIP} {WG} 10.3 International Conference},
  year      = {2014},
  publisher = {Springer Berlin Heidelberg},
  pages     = {144--156},
  isbn      = {978-3-662-44917-2},
  doi       = {10.1007/978-3-662-44917-2_13},
}


@article{zhang2014performance,
  author    = {Zhang, Ying and Peng, Lu and Li, Bin and Peir, Jih-Kwon and Chen, Jianmin},
  title     = {Performance and Power Comparisons Between {Nvidia} and {ATI} {GPUs}},
  journal   = {International Journal of Computer Science \& Information Technology},
  volume    = {6},
  number    = {6},
  year      = {2014}
}

@inproceedings{zhang2011ati,
  author    = {Zhang, Ying and Hu, Yue and Li, Bin and Peng, Lu},
  title     = {Performance and Power Analysis of {ATI} {GPU}: A Statistical Approach},
  booktitle = {6th {IEEE} International Conference on Networking, Architecture and Storage ({NAS})},
  pages     = {149--158},
  year      = {2011}
}

@inproceedings{volkov2010occupation,
  author    = {Volkov, Vasily},
  title     = {Better Performance at Lower Occupancy},
  booktitle = {Proceedings of the {GPU} Technology Conference ({GTC})},
  volume    = {10},
  pages     = {16},
  year      = {2010}
}

@inproceedings{konstantinidis2016gpumembench,
  author    = {Konstantinidis, Elias and Cotronis, Yiannis},
  title     = {A Quantitative Performance Evaluation of Fast on-Chip Memories of {GPUs}},
  booktitle = {24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP)},
  pages     = {448--455},
  year      = {2016},
  doi       = {10.1109/PDP.2016.56}
}

@article{konstantinidis2017mixbench,
  author    = {Konstantinidis, Elias and Cotronis, Yiannis},
  title     = {A Quantitative Roofline Model for {GPU} Kernel Performance Estimation Using Micro-Benchmarks and Hardware Metric Profiling},
  journal   = {Journal of Parallel and Distributed Computing},
  volume    = {107},
  pages     = {37--56},
  year      = {2017},
  doi       = {10.1016/j.jpdc.2017.04.002}
}