% Conference paper (SASP 2009). The abstract is kept in the abstract field,
% which the standard styles do not print; the note field holds only the venue and dates.
@inproceedings{oai:nagasaki-u.repo.nii.ac.jp:00016784,
  author    = {Ling, Cheng and Benkrid, Khaled and Hamada, Tsuyoshi},
  title     = {A parameterisable and scalable {Smith-Waterman} algorithm implementation on {CUDA}-compatible {GPUs}},
  booktitle = {2009 {IEEE} 7th Symposium on Application Specific Processors},
  pages     = {94--100},
  publisher = {IEEE},
  year      = {2009},
  month     = jul,
  note      = {2009 IEEE 7th Symposium on Application Specific Processors (SASP), San Francisco, CA, USA, 27--28 July 2009},
  abstract  = {This paper describes a multi-threaded parallel design and implementation of the Smith-Waterman (SM) algorithm on compute unified device architecture (CUDA)-compatible graphic processing units (GPUs). A novel technique has been put forward to solve the restriction on the length of the query sequence in previous GPU implementations of the Smith-Waterman algorithm. The main reasons behind this limitation in previous GPU implementations were the finite size of local memory and number of threads per block. Our solution to this problem uses a divide and conquer approach to compute the alignment matrix involved in each pairwise sequence alignment, as it divides the entire matrix computation into multiple sub-matrices and allocates the available amount of threads and memory resources to each submatrix iteratively. Intermediate data is stored in shared and global memory on the fly depending on the length of sequences in hand. The proposed technique resulted in up to 4.2 GCUPS (Giga Cell Updates per Second) performance when tested against the SWISS-PROT protein database, which is up to 15 times faster than a equivalent optimised CPU-only implementation running on a Pentium4 3.4GHz desktop computer. Moreover, our implementation can cope with any query or subject sequence size, unlike previously reported GPU implementations of the Smith-Waterman algorithm which makes it fully deployable in real world bioinformatics applications.},
}