001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on June 14, 2010 021 * Author: Mark Chapman 022 */ 023 024package org.biojava.nbio.core.alignment; 025 026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 027import org.biojava.nbio.core.alignment.template.AlignedSequence; 028import org.biojava.nbio.core.alignment.template.AlignedSequence.Step; 029import org.biojava.nbio.core.alignment.template.Profile; 030import org.biojava.nbio.core.alignment.template.ProfileView; 031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 032import org.biojava.nbio.core.sequence.AccessionID; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 034import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 035import org.biojava.nbio.core.sequence.io.util.IOUtils; 036import org.biojava.nbio.core.sequence.location.template.Location; 037import org.biojava.nbio.core.sequence.template.Compound; 038import org.biojava.nbio.core.sequence.template.CompoundSet; 039import org.biojava.nbio.core.sequence.template.Sequence; 040 041import java.io.Serializable; 042import java.util.*; 043 044 045/** 046 * Implements a data structure for the results of sequence alignment. Every {@link List} returned is unmodifiable. 047 * 048 * @author Mark Chapman 049 * @author Paolo Pavan 050 * @param <S> each element of the alignment {@link Profile} is of type S 051 * @param <C> each element of an {@link AlignedSequence} is a {@link Compound} of type C 052 */ 053public class SimpleProfile<S extends Sequence<C>, C extends Compound> implements Serializable, Profile<S, C> { 054 055 056 private static final long serialVersionUID = 1L; 057 058 private List<AlignedSequence<S, C>> list; 059 private List<S> originals; 060 private int length; 061 062 /** 063 * Creates a pair profile for the given already aligned sequences. 064 * 065 * @param query the first sequence of the pair 066 * @param target the second sequence of the pair 067 * @throws IllegalArgumentException if sequences differ in size 068 */ 069 protected SimpleProfile(AlignedSequence<S, C> query, AlignedSequence<S, C> target) { 070 if (query.getLength() != target.getLength()) { 071 throw new IllegalArgumentException("Aligned sequences differ in size"); 072 } 073 list = new ArrayList<AlignedSequence<S, C>>(); 074 list.add(query); 075 list.add(target); 076 list = Collections.unmodifiableList(list); 077 originals = new ArrayList<S>(); 078 originals.add(query.getOriginalSequence()); 079 originals.add(target.getOriginalSequence()); 080 originals = Collections.unmodifiableList(originals); 081 length = query.getLength(); 082 } 083 084 /** 085 * Creates a profile from a single sequence. 086 * 087 * @param sequence sequence to seed profile 088 */ 089 public SimpleProfile(S sequence) { 090 List<Step> s = new ArrayList<Step>(); 091 for (int i = 0; i < sequence.getLength(); i++) { 092 s.add(Step.COMPOUND); 093 } 094 list = new ArrayList<AlignedSequence<S, C>>(); 095 list.add(new SimpleAlignedSequence<S, C>(sequence, s)); 096 list = Collections.unmodifiableList(list); 097 originals = new ArrayList<S>(); 098 originals.add(sequence); 099 originals = Collections.unmodifiableList(originals); 100 length = sequence.getLength(); 101 } 102 103 /** 104 * Creates a pair profile for the given sequences. 105 * 106 * @param query the first sequence of the pair 107 * @param target the second sequence of the pair 108 * @param sx lists whether the query sequence aligns a {@link Compound} or gap at each index of the alignment 109 * @param xb number of {@link Compound}s skipped in the query sequence before the aligned region 110 * @param xa number of {@link Compound}s skipped in the query sequence after the aligned region 111 * @param sy lists whether the target sequence aligns a {@link Compound} or gap at each index of the alignment 112 * @param yb number of {@link Compound}s skipped in the target sequence before the aligned region 113 * @param ya number of {@link Compound}s skipped in the target sequence after the aligned region 114 * @throws IllegalArgumentException if alignments differ in size or given sequences do not fit in alignments 115 */ 116 protected SimpleProfile(S query, S target, List<Step> sx, int xb, int xa, List<Step> sy, int yb, int ya) { 117 if (sx.size() != sy.size()) { 118 throw new IllegalArgumentException("Alignments differ in size"); 119 } 120 list = new ArrayList<AlignedSequence<S, C>>(); 121 list.add(new SimpleAlignedSequence<S, C>(query, sx, xb, xa)); 122 list.add(new SimpleAlignedSequence<S, C>(target, sy, yb, ya)); 123 list = Collections.unmodifiableList(list); 124 originals = new ArrayList<S>(); 125 originals.add(query); 126 originals.add(target); 127 originals = Collections.unmodifiableList(originals); 128 length = sx.size(); 129 } 130 131 /** 132 * Creates a pair profile for the given profiles. 133 * 134 * @param query the first profile of the pair 135 * @param target the second profile of the pair 136 * @param sx lists whether the query profile aligns a {@link Compound} or gap at each index of the alignment 137 * @param sy lists whether the target profile aligns a {@link Compound} or gap at each index of the alignment 138 * @throws IllegalArgumentException if alignments differ in size or given profiles do not fit in alignments 139 */ 140 protected SimpleProfile(Profile<S, C> query, Profile<S, C> target, List<Step> sx, List<Step> sy) { 141 if (sx.size() != sy.size()) { 142 throw new IllegalArgumentException("Alignments differ in size"); 143 } 144 list = new ArrayList<AlignedSequence<S, C>>(); 145 for (AlignedSequence<S, C> s : query) { 146 list.add(new SimpleAlignedSequence<S, C>(s, sx)); 147 } 148 for (AlignedSequence<S, C> s : target) { 149 list.add(new SimpleAlignedSequence<S, C>(s, sy)); 150 } 151 list = Collections.unmodifiableList(list); 152 originals = new ArrayList<S>(); 153 originals.addAll(query.getOriginalSequences()); 154 originals.addAll(target.getOriginalSequences()); 155 originals = Collections.unmodifiableList(originals); 156 length = sx.size(); 157 } 158 159 /** 160 * Creates a profile for the already aligned sequences. 161 * @param alignedSequences the already aligned sequences 162 * @throws IllegalArgument if aligned sequences differ in length or 163 * collection is empty. 164 */ 165 public SimpleProfile(Collection<AlignedSequence<S,C>> alignedSequences) { 166 list = new ArrayList<AlignedSequence<S,C>>(); 167 originals = new ArrayList<S>(); 168 169 Iterator<AlignedSequence<S,C>> itr = alignedSequences.iterator(); 170 if(!itr.hasNext()) { 171 throw new IllegalArgumentException("alignedSequences must not be empty"); 172 } 173 174 AlignedSequence<S, C> curAlignedSeq = itr.next(); 175 length = curAlignedSeq.getLength(); 176 list.add(curAlignedSeq); 177 originals.add(curAlignedSeq.getOriginalSequence()); 178 179 while (itr.hasNext()) { 180 curAlignedSeq = itr.next(); 181 if (curAlignedSeq.getLength() != length) { 182 throw new IllegalArgumentException("Aligned sequences differ in size"); 183 } 184 list.add(curAlignedSeq); 185 originals.add(curAlignedSeq.getOriginalSequence()); 186 } 187 list = Collections.unmodifiableList(list); 188 originals = Collections.unmodifiableList(originals); 189 } 190 191 192 // methods for Profile 193 194 @Override 195 public AlignedSequence<S, C> getAlignedSequence(int listIndex) { 196 return list.get(listIndex - 1); 197 } 198 199 @Override 200 public AlignedSequence<S, C> getAlignedSequence(S sequence) { 201 for (AlignedSequence<S, C> s : list) { 202 if (s.equals(sequence) || s.getOriginalSequence().equals(sequence)) { 203 return s; 204 } 205 } 206 return null; 207 } 208 209 @Override 210 public List<AlignedSequence<S, C>> getAlignedSequences() { 211 return list; 212 } 213 214 @Override 215 public List<AlignedSequence<S, C>> getAlignedSequences(int... listIndices) { 216 List<AlignedSequence<S, C>> tempList = new ArrayList<AlignedSequence<S, C>>(); 217 for (int i : listIndices) { 218 tempList.add(getAlignedSequence(i)); 219 } 220 return Collections.unmodifiableList(tempList); 221 } 222 223 @Override 224 public List<AlignedSequence<S, C>> getAlignedSequences(S... sequences) { 225 List<AlignedSequence<S, C>> tempList = new ArrayList<AlignedSequence<S, C>>(); 226 for (S s : sequences) { 227 tempList.add(getAlignedSequence(s)); 228 } 229 return Collections.unmodifiableList(tempList); 230 } 231 232 @Override 233 public C getCompoundAt(int listIndex, int alignmentIndex) { 234 return getAlignedSequence(listIndex).getCompoundAt(alignmentIndex); 235 } 236 237 @Override 238 public C getCompoundAt(S sequence, int alignmentIndex) { 239 AlignedSequence<S, C> s = getAlignedSequence(sequence); 240 return (s == null) ? null : s.getCompoundAt(alignmentIndex); 241 } 242 243 @Override 244 public int[] getCompoundCountsAt(int alignmentIndex) { 245 return getCompoundCountsAt(alignmentIndex, getCompoundSet().getAllCompounds()); 246 } 247 248 @Override 249 public int[] getCompoundCountsAt(int alignmentIndex, List<C> compounds) { 250 int[] counts = new int[compounds.size()]; 251 C gap = getCompoundSet().getCompoundForString("-"); 252 int igap = compounds.indexOf(gap); 253 for (C compound : getCompoundsAt(alignmentIndex)) { 254 int i = compounds.indexOf(compound); 255 if (i >= 0 && i != igap && !getCompoundSet().compoundsEquivalent(compound, gap)) { 256 counts[i]++; 257 } 258 } 259 return counts; 260 } 261 262 @Override 263 public List<C> getCompoundsAt(int alignmentIndex) { 264 // TODO handle circular alignments 265 List<C> column = new ArrayList<C>(); 266 for (AlignedSequence<S, C> s : list) { 267 column.add(s.getCompoundAt(alignmentIndex)); 268 } 269 return Collections.unmodifiableList(column); 270 } 271 272 @Override 273 public CompoundSet<C> getCompoundSet() { 274 return list.get(0).getCompoundSet(); 275 } 276 277 @Override 278 public float[] getCompoundWeightsAt(int alignmentIndex) { 279 return getCompoundWeightsAt(alignmentIndex, getCompoundSet().getAllCompounds()); 280 } 281 282 @Override 283 public float[] getCompoundWeightsAt(int alignmentIndex, List<C> compounds) { 284 float[] weights = new float[compounds.size()]; 285 int[] counts = getCompoundCountsAt(alignmentIndex, compounds); 286 float total = 0.0f; 287 for (int i : counts) { 288 total += i; 289 } 290 if (total > 0.0f) { 291 for (int i = 0; i < weights.length; i++) { 292 weights[i] = counts[i]/total; 293 } 294 } 295 return weights; 296 } 297 298 @Override 299 public int getIndexOf(C compound) { 300 for (int i = 1; i <= length; i++) { 301 if (getCompoundsAt(i).contains(compound)) { 302 return i; 303 } 304 } 305 return -1; 306 } 307 308 @Override 309 public int[] getIndicesAt(int alignmentIndex) { 310 int[] indices = new int[list.size()]; 311 for (int i = 0; i < indices.length; i++) { 312 indices[i] = list.get(i).getSequenceIndexAt(alignmentIndex); 313 } 314 return indices; 315 } 316 317 @Override 318 public int getLastIndexOf(C compound) { 319 for (int i = length; i >= 1; i--) { 320 if (getCompoundsAt(i).contains(compound)) { 321 return i; 322 } 323 } 324 return -1; 325 } 326 327 @Override 328 public int getLength() { 329 return length; 330 } 331 332 @Override 333 public List<S> getOriginalSequences() { 334 return originals; 335 } 336 337 @Override 338 public int getSize() { 339 int size = 0; 340 for (AlignedSequence<S, C> s : list) { 341 size += s.getOverlapCount(); 342 } 343 return size; 344 } 345 346 @Override 347 public ProfileView<S, C> getSubProfile(Location location) { 348 // TODO ProfileView<S, C> getSubProfile(Location) 349 return null; 350 } 351 352 @Override 353 public boolean hasGap(int alignmentIndex) { 354 C gap = getCompoundSet().getCompoundForString("-"); 355 for (C compound : getCompoundsAt(alignmentIndex)) { 356 if (getCompoundSet().compoundsEquivalent(compound, gap)) { 357 return true; 358 } 359 } 360 return false; 361 } 362 363 @Override 364 public boolean isCircular() { 365 for (AlignedSequence<S, C> s : list) { 366 if (s.isCircular()) { 367 return true; 368 } 369 } 370 return false; 371 } 372 373 @Override 374 public String toString(int width) { 375 return toString(width, null, IOUtils.getIDFormat(list), true, true, true, true, true, false); 376 } 377 378 @Override 379 public String toString(StringFormat format) { 380 switch (format) { 381 case ALN: 382 case CLUSTALW: 383 default: 384 return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(list) + " ", 385 false, true, true, false, true, false); 386 case FASTA: 387 return toString(60, null, ">%s%n", false, false, false, false, false, false); 388 case GCG: 389 case MSF: 390 return toString(50, IOUtils.getGCGHeader(list), IOUtils.getIDFormat(list), false, false, true, false, 391 false, false); 392 case PDBWEB: 393 return toString(60, null, "%10s", true, true, true, false, true, true); 394 } 395 } 396 397 // method from Object 398 399 @Override 400 public String toString() { 401 return toString(getLength(), null, null, false, false, false, false, false, false); 402 } 403 404 // method for Iterable 405 406 @Override 407 public Iterator<AlignedSequence<S, C>> iterator() { 408 return list.iterator(); 409 } 410 411 // helper methods 412 413 // creates formatted String 414 private String toString(int width, String header, String idFormat, boolean seqIndexPre, boolean seqIndexPost, 415 boolean interlaced, boolean aligIndices, boolean aligConservation, boolean webDisplay) { 416 417 // TODO handle circular alignments 418 StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header); 419 420 if ( webDisplay && list.size() == 2){ 421 s.append("<div><pre>"); 422 } 423 424 width = Math.max(1, width); 425 int seqIndexPad = (int) (Math.floor(Math.log10(getLength())) + 2); 426 String seqIndexFormatPre = "%" + seqIndexPad + "d ", seqIndexFormatPost = "%" + seqIndexPad + "d"; 427 if (interlaced) { 428 String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) + 429 "d%n"; 430 for (int i = 0; i < getLength(); i += width) { 431 int start = i + 1, end = Math.min(getLength(), i + width); 432 if (i > 0) { 433 s.append(String.format("%n")); 434 } 435 if (aligIndices) { 436 if (end < i + width) { 437 int line = end - start + 1; 438 aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) + 439 "d%n"; 440 } 441 if (idFormat != null) { 442 s.append(String.format(idFormat, "")); 443 } 444 if (seqIndexPre) { 445 s.append(String.format("%" + (seqIndexPad + 1) + "s", "")); 446 } 447 s.append(String.format(aligIndFormat, start, end)); 448 } 449 450 int counter = 0; 451 for (AlignedSequence<S, C> as : list) { 452 counter++; 453 454 if ( webDisplay && list.size() == 2){ 455 printSequenceAlignmentWeb(s, counter, idFormat, seqIndexPre, seqIndexFormatPre, seqIndexPost, 456 seqIndexFormatPost, start, end); 457 } else { 458 if (idFormat != null) { 459 s.append(String.format(idFormat, as.getAccession())); 460 } 461 if (seqIndexPre) { 462 s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start))); 463 } 464 465 s.append(as.getSubSequence(start, end).getSequenceAsString()); 466 467 if (seqIndexPost) { 468 s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end))); 469 } 470 s.append(String.format("%n")); 471 } 472 473 if (aligConservation && list.size() == 2 && counter == 1) { 474 printConservation(s, idFormat, seqIndexPad, seqIndexPre, start, end, webDisplay); 475 } 476 } 477 478 } 479 } else { 480 for (AlignedSequence<S, C> as : list) { 481 if (idFormat != null) { 482 s.append(String.format(idFormat, as.getAccession())); 483 } 484 for (int i = 0; i < getLength(); i += width) { 485 int start = i + 1, end = Math.min(getLength(), i + width); 486 if (seqIndexPre) { 487 s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start))); 488 } 489 s.append(as.getSubSequence(start, end).getSequenceAsString()); 490 if (seqIndexPost) { 491 s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end))); 492 } 493 s.append(String.format("%n")); 494 } 495 } 496 } 497 498 if (webDisplay && aligConservation && list.size() == 2) { 499 s.append(IOUtils.getPDBLegend()); 500 } 501 return s.toString(); 502 } 503 504 private void printSequenceAlignmentWeb(StringBuilder s, int counter, String idFormat, boolean seqIndexPre, 505 String seqIndexFormatPre, boolean seqIndexPost, String seqIndexFormatPost, int start, int end) { 506 AlignedSequence<S,C> as1 = list.get(0), as2 = list.get(1), as = list.get(counter - 1); 507 508 if (idFormat != null) { 509 s.append(String.format(idFormat, as.getAccession())); 510 } 511 if (seqIndexPre) { 512 s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start))); 513 } 514 515 String mySeq = as.getSubSequence(start, end).getSequenceAsString(); 516 String s1 = as1.getSubSequence(start, end).getSequenceAsString(); 517 String s2 = as2.getSubSequence(start, end).getSequenceAsString(); 518 519 for (int i = 0; i < s1.length(); i++) { 520 if (i >= s2.length() || i >= mySeq.length()) 521 break; 522 523 char c1 = s1.charAt(i); 524 char c2 = s2.charAt(i); 525 char c = mySeq.charAt(i); 526 s.append(IOUtils.getPDBCharacter(true, c1, c2, isSimilar(c1, c2), c)); 527 } 528 529 if (seqIndexPost) { 530 s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end))); 531 } 532 s.append(String.format("%n")); 533 534 } 535 536 private void printConservation(StringBuilder s, String idFormat, int seqIndexPad, boolean seqIndexPre, int start, 537 int end, boolean webDisplay) { 538 AlignedSequence<S,C> as1 = list.get(0), as2 = list.get(1); 539 540 if (idFormat != null) { 541 AccessionID ac1 = as1.getAccession(); 542 String id1 = (ac1 == null) ? "null" : ac1.getID(); 543 id1 = id1.replaceAll(".", " "); 544 s.append(String.format(idFormat, id1)); 545 } 546 547 if (seqIndexPre) { 548 s.append(String.format("%" + (seqIndexPad + 1) + "s", "")); 549 } 550 551 String subseq1 = as1.getSubSequence(start, end).getSequenceAsString(); 552 String subseq2 = as2.getSubSequence(start, end).getSequenceAsString(); 553 554 for ( int ii =0 ; ii < subseq1.length() ; ii++){ 555 if ( ii >= subseq2.length()) 556 break; 557 char c1 = subseq1.charAt(ii); 558 char c2 = subseq2.charAt(ii); 559 s.append(IOUtils.getPDBConservation(webDisplay, c1, c2, isSimilar(c1, c2))); 560 } 561 562 s.append(String.format("%n")); 563 } 564 565 protected static final SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getBlosum65(); 566 567 private boolean isSimilar(char c1, char c2) { 568 AminoAcidCompoundSet set = AminoAcidCompoundSet.getAminoAcidCompoundSet(); 569 570 AminoAcidCompound aa1 = set.getCompoundForString(String.valueOf(c1)); 571 AminoAcidCompound aa2 = set.getCompoundForString(String.valueOf(c2)); 572 573 short val = matrix.getValue(aa1,aa2); 574 return val > 0; 575 } 576 577}