% Journal article: Linpack computation-communication overlap on a GPU cluster.
% Exported from an institutional repository; the citation key is kept as exported
% so existing citations continue to resolve.
% NOTE(review): the two numbers in internal-note arrived at the head of the
% exported note field; presumably repository researcher identifiers -- confirm.
% They are kept in an ignored field so they are not printed in the bibliography.
@article{oai:uec.repo.nii.ac.jp:00000288,
  author        = {Ohmura, Junichi and Miyoshi, Takefumi and Irie, Hidetsugu and Yoshinaga, Tsutomu},
  title         = {Computation-Communication Overlap of {Linpack} on a {GPU}-Accelerated {PC} Cluster},
  journal       = {IEICE Transactions on Information and Systems},
  volume        = {E94-D},
  number        = {12},
  pages         = {2319--2327},
  month         = dec,
  year          = {2011},
  abstract      = {In this paper, we propose an approach to obtaining enhanced performance of the Linpack benchmark on a GPU-accelerated PC cluster connected via relatively slow inter-node connections. For one node with a quad-core Intel Xeon W3520 processor and a NVIDIA Tesla C1060 GPU card, we implement a CPU-GPU parallel double-precision general matrix-matrix multiplication (dgemm) operation, and achieve a performance improvement of 34\% compared with the GPU-only case and 64\% compared with the CPU-only case. For an entire 16-node cluster, each node of which is the same as the above and is connected with two gigabit Ethernet links, we use a computation-communication overlap scheme with GPU acceleration for the Linpack benchmark, and achieve a performance improvement of 28\% compared with the GPU-accelerated high-performance Linpack benchmark (HPL) without overlapping. Our overlap GPU acceleration solution uses overlaps in which the main inter-node communication and data transfer to the GPU device memory are overlapped with the main computation task on the CPU cores. These overlaps use multi-core processors, which almost all of today's high-performance computers use. In particular, as well as using a CPU core for communication tasks, we also simultaneously use other CPU cores and the GPU for computation tasks. In order to enable overlap between inter-node communication and computation tasks, we eliminate their close dependence by breaking the main computation task into smaller tasks and rescheduling. Based on a scheme in which part of the CPU computation power is simultaneously used for tasks other than computation tasks, we experimentally find the optimal computation ratio for CPUs; this ratio differs from the case of parallel dgemm operation of one node.},
  internal-note = {1000050422407, 1000060210738},
}