{"created":"2023-05-15T08:38:03.221806+00:00","id":288,"links":{},"metadata":{"_buckets":{"deposit":"de60f270-5f2d-41c3-a6b4-3f6d8b3ff403"},"_deposit":{"created_by":3,"id":"288","owners":[3],"pid":{"revision_id":0,"type":"depid","value":"288"},"status":"published"},"_oai":{"id":"oai:uec.repo.nii.ac.jp:00000288","sets":["6"]},"author_link":["6416","6414","6415","6413"],"item_10001_biblio_info_7":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicIssueDates":{"bibliographicIssueDate":"2011-12-01","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"12","bibliographicPageEnd":"2327","bibliographicPageStart":"2319","bibliographicVolumeNumber":"E94-D","bibliographic_titles":[{"bibliographic_title":"IEICE Transactions on Information and Systems"}]}]},"item_10001_description_4":{"attribute_name":"著者ID","attribute_value_mlt":[{"subitem_description":"1000050422407","subitem_description_type":"Other"},{"subitem_description":"1000060210738","subitem_description_type":"Other"}]},"item_10001_description_6":{"attribute_name":"内容記述","attribute_value_mlt":[{"subitem_description":"In this paper, we propose an approach to obtaining enhanced performance of the Linpack benchmark on a GPU-accelerated PC cluster connected via relatively slow inter-node connections. For one node with a quad-core Intel Xeon W3520 processor and a NVIDIA Tesla C1060 GPU card, we implement a CPU-GPU parallel double-precision general matrix-matrix multiplication (dgemm) operation, and achieve a performance improvement of 34% compared with the GPU-only case and 64% compared with the CPU-only case. For an entire 16-node cluster, each node of which is the same as the above and is connected with two gigabit Ethernet links, we use a computation-communication overlap scheme with GPU acceleration for the Linpack benchmark, and achieve a performance improvement of 28% compared with the GPU-accelerated high-performance Linpack benchmark (HPL) without overlapping. 
Our overlap GPU acceleration solution uses overlaps in which the main inter-node communication and data transfer to the GPU device memory are overlapped with the main computation task on the CPU cores. These overlaps use multi-core processors, which almost all of today’s high-performance computers use. In particular, as well as using a CPU core for communication tasks, we also simultaneously use other CPU cores and the GPU for computation tasks. In order to enable overlap between inter-node communication and computation tasks, we eliminate their close dependence by breaking the main computation task into smaller tasks and rescheduling. Based on a scheme in which part of the CPU computation power is simultaneously used for tasks other than computation tasks, we experimentally find the optimal computation ratio for CPUs; this ratio differs from the case of parallel dgemm operation of one node.","subitem_description_type":"Other"}]},"item_10001_publisher_8":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"The Institute of Electronics, Information and Communication Engineers"}]},"item_10001_relation_17":{"attribute_name":"関連サイト","attribute_value_mlt":[{"subitem_relation_name":[{"subitem_relation_name_text":"http://www.ieice.org/jpn/index.html"}],"subitem_relation_type_id":{"subitem_relation_type_id_text":"http://www.ieice.org/jpn/index.html","subitem_relation_type_select":"URI"}}]},"item_10001_source_id_9":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"09168532 ","subitem_source_identifier_type":"ISSN"}]},"item_10001_text_24":{"attribute_name":"自由記述ライセンス","attribute_value_mlt":[{"subitem_text_value":"Copyright c 2011 The Institute of Electronics, Information and Communication 
Engineers"}]},"item_10001_version_type_20":{"attribute_name":"著者版フラグ","attribute_value_mlt":[{"subitem_version_resource":"http://purl.org/coar/version/c_970fb48d4fbd8a85","subitem_version_type":"VoR"}]},"item_creator":{"attribute_name":"著者","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Junichi, Ohmura","creatorNameLang":"en"}],"nameIdentifiers":[{"nameIdentifier":"6413","nameIdentifierScheme":"WEKO"}]},{"creatorNames":[{"creatorName":"Takefumi, Miyoshi","creatorNameLang":"en"}],"nameIdentifiers":[{"nameIdentifier":"6414","nameIdentifierScheme":"WEKO"}]},{"creatorNames":[{"creatorName":"Hidetsugu, Irie","creatorNameLang":"en"}],"nameIdentifiers":[{"nameIdentifier":"6415","nameIdentifierScheme":"WEKO"}]},{"creatorNames":[{"creatorName":"Tsutomu, Yoshinaga","creatorNameLang":"en"}],"nameIdentifiers":[{"nameIdentifier":"6416","nameIdentifierScheme":"WEKO"}]}]},"item_files":{"attribute_name":"ファイル情報","attribute_type":"file","attribute_value_mlt":[{"accessrole":"open_date","date":[{"dateType":"Available","dateValue":"2016-09-15"}],"displaytype":"detail","filename":"9000000554.pdf","filesize":[{"value":"652.4 kB"}],"format":"application/pdf","licensetype":"license_note","mimetype":"application/pdf","url":{"label":"9000000554.pdf","url":"https://uec.repo.nii.ac.jp/record/288/files/9000000554.pdf"},"version_id":"ed97d760-5f32-408a-a7b1-02ffde2301c2"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"parallel processing, multi-core processor, GPU, computation-communication overlap","subitem_subject_language":"en","subitem_subject_scheme":"Other"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourcetype":"journal article","resourceuri":"http://purl.org/coar/resource_type/c_6501"}]},"item_title":"Computation-Communication Overlap of Linpack on a GPU-Accelerated PC 
Cluster","item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Computation-Communication Overlap of Linpack on a GPU-Accelerated PC Cluster","subitem_title_language":"en"}]},"item_type_id":"10001","owner":"3","path":["6"],"pubdate":{"attribute_name":"公開日","attribute_value":"2011-12-01"},"publish_date":"2011-12-01","publish_status":"0","recid":"288","relation_version_is_last":true,"title":["Computation-Communication Overlap of Linpack on a GPU-Accelerated PC Cluster"],"weko_creator_id":"3","weko_shared_id":3},"updated":"2023-05-15T10:07:00.868850+00:00"}