001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.alignment; 022 023import java.util.Formatter; 024import java.util.HashMap; 025import java.util.Map; 026 027import org.biojava.bio.BioException; 028import org.biojava.bio.seq.GappedSequence; 029import org.biojava.bio.seq.Sequence; 030import org.biojava.bio.seq.impl.SimpleGappedSequence; 031import org.biojava.bio.symbol.Symbol; 032import org.biojava.bio.symbol.SymbolList; 033 034/** 035 * This class stores the result of an alignment procedure that creates a 036 * pairwise alignment of two sequences. Currently, these sequences must have the 037 * identical length (this may be changed in the future). A format routine 038 * produces a BLAST-like output for the sequences but all necessary information 039 * to visualize the alignment are contained in this class. 040 * 041 * @author Andreas Dräger 042 * 043 */ 044public class AlignmentPair extends SimpleAlignment { 045 046 /** 047 * Generated serial version identifier 048 */ 049 private static final long serialVersionUID = -8834131912021612261L; 050 051 /** 052 * 053 * @param query 054 * @param subject 055 * @param algorithm 056 * @return 057 * @throws Exception 058 */ 059 public static AlignmentPair align(Sequence query, Sequence subject, 060 AlignmentAlgorithm algorithm) throws Exception { 061 AlignmentPair pair = algorithm.pairwiseAlignment(query, subject); 062 return pair; 063 } 064 065 /** 066 * Creates the Map required by the super class. 067 * 068 * @param s1 069 * @param s2 070 * @return 071 */ 072 private static Map<String, SymbolList> createHashMap(Sequence s1, 073 Sequence s2) { 074 Map<String, SymbolList> m = new HashMap<String, SymbolList>(); 075 m.put(s1.getName(), s1); 076 m.put(s2.getName(), s2); 077 return m; 078 } 079 080 /** 081 * Number of identical elements in both sequences 082 */ 083 private int identicals; 084 085 /** 086 * Start position in the query 087 */ 088 private int queryStart; 089 090 /** 091 * @return the queryStart 092 */ 093 public int getQueryStart() { 094 return queryStart; 095 } 096 097 /** 098 * @param queryStart 099 * the queryStart to set 100 * @throws BioException 101 */ 102 void setQueryStart(int queryStart) throws BioException { 103 this.queryStart = queryStart; 104 init(); 105 } 106 107 /** 108 * End position in the query 109 */ 110 private int queryEnd; 111 112 /** 113 * @return the queryEnd 114 */ 115 public int getQueryEnd() { 116 return queryEnd; 117 } 118 119 /** 120 * @param queryEnd 121 * the queryEnd to set 122 * @throws BioException 123 */ 124 void setQueryEnd(int queryEnd) throws BioException { 125 this.queryEnd = queryEnd; 126 init(); 127 } 128 129 /** 130 * Number of gaps in query 131 */ 132 private int nGapsQ; 133 134 /** 135 * Number of gaps in subject 136 */ 137 private int nGapsS; 138 139 /** 140 * Length of the query sequence 141 */ 142 private final Sequence query; 143 144 /** 145 * Start position in the subject 146 */ 147 private int subjectStart; 148 149 /** 150 * @return the subjectStart 151 */ 152 public int getSubjectStart() { 153 return subjectStart; 154 } 155 156 /** 157 * @param subjectStart 158 * the subjectStart to set 159 * @throws BioException 160 */ 161 void setSubjectStart(int subjectStart) throws BioException { 162 this.subjectStart = subjectStart; 163 init(); 164 } 165 166 /** 167 * End position in the subject 168 */ 169 private int subjectEnd; 170 171 /** 172 * @return the subjectEnd 173 */ 174 public int getSubjectEnd() { 175 return subjectEnd; 176 } 177 178 /** 179 * @param subjectEnd 180 * the subjectEnd to set 181 * @throws BioException 182 */ 183 void setSubjectEnd(int subjectEnd) throws BioException { 184 this.subjectEnd = subjectEnd; 185 init(); 186 } 187 188 /** 189 * Number of similar symbols 190 */ 191 private int similars; 192 193 /** 194 * The subject sequence. 195 */ 196 private final Sequence subject; 197 198 /** 199 * Reference to the underlying substitution matrix of this alignment. 200 */ 201 private final SubstitutionMatrix subMatrix; 202 203 /** 204 * Time consumption to create this alignment. 205 */ 206 private long time; 207 208 /** 209 * 210 * @param queryStart 211 * the start position in the query, where the alignment starts. 212 * For example zero for normal Needleman-Wunsch-Alignments. 213 * @param queryEnd 214 * the end position, that means the sequence coordinate, which is 215 * the last symbol of the query sequence. Counting starts at 216 * zero! 217 * @param queryLength 218 * The length of the query sequence without gaps. 219 * @param subjectStart 220 * These are all the same for the target. Have a look at these 221 * above. 222 * @param subjectEnd 223 * @param subMatrix 224 * the subsitution Matrix used for calculating the alignment 225 * @throws IllegalArgumentException 226 * @throws BioException 227 */ 228 public AlignmentPair(Sequence query, Sequence subject, int queryStart, 229 int queryEnd, int subjectStart, int subjectEnd, 230 SubstitutionMatrix subMatrix) throws IllegalArgumentException, 231 BioException { 232 super(createHashMap(query, subject)); 233 this.subMatrix = subMatrix; 234 this.query = query; 235 this.subject = subject; 236 this.queryStart = queryStart; 237 this.queryEnd = queryEnd; 238 this.subjectStart = subjectStart; 239 this.subjectEnd = subjectEnd; 240 init(); 241 } 242 243 /** 244 * 245 * @param query 246 * @param subject 247 * @param subMatrix 248 * @throws IllegalArgumentException 249 * @throws BioException 250 */ 251 public AlignmentPair(Sequence query, Sequence subject, 252 SubstitutionMatrix subMatrix) throws IllegalArgumentException, 253 BioException { 254 this(query, subject, 1, query.length(), 1, subject.length(), subMatrix); 255 } 256 257 /** 258 * 259 * @throws BioException 260 */ 261 private void init() throws BioException { 262 similars = 0; 263 identicals = 0; 264 nGapsQ = 0; 265 nGapsS = 0; 266 for (int i = 0; i < Math.min(queryEnd - queryStart, subjectEnd 267 - subjectStart); i++) { 268 Symbol a = query.symbolAt(i + queryStart); 269 Symbol b = subject.symbolAt(i + subjectStart); 270 boolean gap = false; 271 if (a.equals(b)) { 272 identicals++; 273 } 274 275 // get score for this pair. if it is positive, they are similar... 276 if (a.equals(query.getAlphabet().getGapSymbol())) { 277 nGapsQ++; 278 gap = true; 279 } 280 if (b.equals(subject.getAlphabet().getGapSymbol())) { 281 nGapsS++; 282 gap = true; 283 } 284 if (!gap && subMatrix != null && subMatrix.getValueAt(a, b) > 0) { 285 similars++; 286 } 287 } 288 } 289 290 /** 291 * @return the time 292 */ 293 public long getComputationTime() { 294 return time; 295 } 296 297 /** 298 * @return the ngapsq 299 */ 300 public int getNumGapsInQuery() { 301 return nGapsQ; 302 } 303 304 /** 305 * @return the ngapst 306 */ 307 public int getNumGapsInSubject() { 308 return nGapsS; 309 } 310 311 /** 312 * @return the identicals 313 */ 314 public int getNumIdenticals() { 315 return identicals; 316 } 317 318 /** 319 * @return the similars 320 */ 321 public int getNumSimilars() { 322 return similars; 323 } 324 325 /** 326 * 327 * @return 328 */ 329 public float getPercentIdentityQuery() { 330 return identicals / (float) (query.length() - nGapsQ) * 100; 331 } 332 333 /** 334 * 335 * @return 336 */ 337 public float getPercentIdentitySubject() { 338 return identicals / (float) (subject.length() - nGapsS) * 100; 339 } 340 341 /** 342 * 343 * @return 344 */ 345 public float getPercentSimilarityQuery() { 346 return similars / (float) query.length() * 100; 347 } 348 349 /** 350 * 351 * @return 352 */ 353 public float getPercentSimilaritySubject() { 354 return similars / (float) subject.length() * 100; 355 } 356 357 /** 358 * 359 * @return 360 */ 361 public int getQueryLength() { 362 return query.length(); 363 } 364 365 /** 366 * 367 * @return 368 */ 369 public int getSubjectLength() { 370 return subject.length(); 371 } 372 373 /** 374 * @return the subMatrix 375 */ 376 public SubstitutionMatrix getSubstitutionMatrix() { 377 return subMatrix; 378 } 379 380 /** 381 * @param time 382 * The time in milliseconds, which was needed to generate the 383 * alignment. 384 */ 385 void setComputationTime(long time) { 386 this.time = time; 387 } 388 389 /** 390 * 391 * @return 392 * @throws BioException 393 */ 394 public String formatOutput() throws BioException { 395 return formatOutput(60); 396 } 397 398 /** 399 * This method provides a BLAST-like formated alignment from the given 400 * <code>String</code>s, in which the sequence coordinates and the 401 * information "Query" or "Sbjct", respectively is added to each line. Each 402 * line contains <code>width</code> sequence characters including the gap 403 * symbols plus the meta information. There is one white line between two 404 * pairs of sequences. 405 * 406 * @param width 407 * the number of symbols to be displayed per line. 408 * @return formated String. 409 * @throws BioException 410 */ 411 public String formatOutput(int width) throws BioException { 412 int i, j; 413 /* 414 * Highlights equal symbols within the alignment, String match/missmatch 415 * representation 416 */ 417 StringBuilder path = new StringBuilder(); 418 for (i = 0; i < Math.min(queryEnd - queryStart, subjectEnd 419 - subjectStart) + 1; i++) { 420 Symbol a = query.symbolAt(i + queryStart); 421 Symbol b = subject.symbolAt(i + subjectStart); 422 if (!a.equals(query.getAlphabet().getGapSymbol()) 423 && !b.equals(subject.getAlphabet().getGapSymbol()) 424 && ((subMatrix.getValueAt(a, b) >= 0) || a.equals(b))) { 425 path.append('|'); 426 } else { 427 path.append(' '); 428 } 429 } 430 431 int maxLength = path.length(); 432 /* 433 * Math.max(queryEnd - queryStart, subjectEnd - subjectStart) + 1; 434 */ 435 Formatter output = new Formatter(); 436 output.format("%n Time (ms): %s%n", time); 437 output.format(" Length: %d%n", maxLength); 438 output.format(" Score: %d%n", getScore()); 439 output.format(" Query: %s, Length: %d%n", query.getName(), query 440 .length() 441 - nGapsQ); 442 output.format(" Sbjct: %s, Length: %d%n", subject.getName(), 443 subject.length() - nGapsS); 444 output.format( 445 " Identities: %d/%d, i.e., %d %% (query) and %d %% (sbjct)%n", 446 identicals, maxLength, Math.round(getPercentIdentityQuery()), 447 Math.round(getPercentIdentitySubject())); 448 output.format( 449 " Similars: %d/%d, i.e., %d %% (query) and %d %% (sbjct)%n", 450 similars, maxLength, Math.round(getPercentSimilarityQuery()), 451 Math.round(getPercentSimilaritySubject())); 452 output.format( 453 " No. gaps: %d (%d %%) in query and %d (%d %%) in sbjct%n", 454 nGapsQ, Math.round(getPercentGapsQuery()), nGapsS, Math 455 .round(getPercentGapsTarget())); 456 457 int queryLPos = queryStart, queryRPos, pathLPos = 0, pathRPos; 458 int subjectLPos = subjectStart, subjectRPos; 459 int ql = queryLPos - 1, qr = queryLPos - 1, qgaps; 460 int sl = subjectLPos - 1, sr = subjectLPos - 1, sgaps; 461 462 int widthLeft = String.valueOf(Math.max(queryStart, queryEnd)).length(); 463 int widthRight = String.valueOf(Math.max(queryEnd, subjectEnd)) 464 .length() + 1; 465 466 // Take width of the meta information into account. 467 width = Math.max(width - widthLeft - widthRight - 12, 2); 468 469 for (i = 1; i <= Math.ceil((double) maxLength / width); i++) { 470 471 // Query 472 queryRPos = Math.min(queryStart + i * width - 1, Math.min(queryEnd, 473 subjectEnd - subjectStart + queryStart)); 474 qgaps = 0; 475 for (j = queryLPos; j <= queryRPos; j++) { 476 if (!query.symbolAt(j).equals( 477 query.getAlphabet().getGapSymbol())) { 478 qr++; 479 } else { 480 qgaps++; 481 } 482 } 483 if (qgaps <= queryRPos - queryLPos) { 484 ql++; 485 } 486 output.format("%nQuery: %" + widthLeft + "d ", ql); 487 output.format("%s ", query.subStr(queryLPos, queryRPos)); 488 output.format("%-" + widthRight + "d%n", qr); 489 queryLPos = queryRPos + 1; 490 ql = qr; 491 492 // Path 493 pathRPos = Math.min(i * width, path.length()); 494 output.format("%-" + (widthLeft + 10) + "c%s", Character 495 .valueOf(' '), path.substring(pathLPos, pathRPos)); 496 pathLPos = pathRPos; 497 498 // Sbjct 499 subjectRPos = Math.min(subjectStart + i * width - 1, Math.min( 500 queryEnd - queryStart + subjectStart, subjectEnd)); 501 sgaps = 0; 502 for (j = subjectLPos; j <= subjectRPos; j++) { 503 if (!subject.symbolAt(j).equals( 504 subject.getAlphabet().getGapSymbol())) { 505 sr++; 506 } else { 507 sgaps++; 508 } 509 } 510 if (sgaps <= subjectRPos - subjectLPos) { 511 sl++; 512 } 513 output.format("%nSbjct: %" + widthLeft + "d ", sl); 514 output.format("%s ", subject.subStr(subjectLPos, subjectRPos)); 515 output.format("%-" + widthRight + "d%n", sr); 516 subjectLPos = subjectRPos + 1; 517 sl = sr; 518 } 519 return output.toString(); 520 } 521 522 /** 523 * 524 * @return 525 */ 526 public float getPercentGapsTarget() { 527 return nGapsS / (float) subject.length() * 100; 528 } 529 530 /** 531 * 532 * @return 533 */ 534 public float getPercentGapsQuery() { 535 return nGapsQ / (float) query.length() * 100; 536 } 537 538 539 /** 540 * Return the query sequence as a gapped sequence. 541 * 542 * @return the query sequence as a gapped sequence 543 */ 544 public GappedSequence getQuery() { 545 return (query instanceof GappedSequence) ? (GappedSequence) query : new SimpleGappedSequence(query); 546 } 547 548 /** 549 * Return the subject sequence as a gapped sequence. 550 * 551 * @return the subject sequence as a gapped sequence 552 */ 553 public GappedSequence getSubject() { 554 return (subject instanceof GappedSequence) ? (GappedSequence) subject : new SimpleGappedSequence(subject); 555 } 556}