/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * Created on DATE
 *
 */

package org.biojava.nbio.core.sequence;

import org.biojava.nbio.core.sequence.io.util.IOUtils;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.LightweightProfile;
import org.biojava.nbio.core.sequence.template.Sequence;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Implements a minimal data structure for reading and writing a sequence alignment. The full {@code Profile} data
 * structure in the alignment module provides additional functionality.
 * <p>
 * All sequences held by one instance have the same length: the length of the first sequence added fixes the
 * alignment length, and later additions of a different length are rejected.
 *
 * @author Scooter Willis
 * @author Mark Chapman
 * @param <S> type of the aligned sequences
 * @param <C> type of the compounds the sequences are made of
 */
public class MultipleSequenceAlignment<S extends Sequence<C>, C extends Compound> implements LightweightProfile<S, C> {

    private final List<S> sequences = new ArrayList<>();

    /** Common length of all aligned sequences; {@code null} while the alignment holds no sequence. */
    private Integer length = null;

    /**
     * Adds a sequence to the alignment. A sequence that has been aligned to other sequences will have inserts.
     * The first sequence added defines the alignment length.
     *
     * @param sequence the aligned sequence to add
     * @throws IllegalArgumentException if the sequence length differs from the alignment length
     */
    public void addAlignedSequence(S sequence) {
        if (length == null) {
            length = sequence.getLength();
        }
        if (sequence.getLength() != length) {
            throw new IllegalArgumentException(sequence.getAccession() + " length = " + sequence.getLength() +
                    " not equal to MSA length = " + length);
        }
        sequences.add(sequence);
    }

    /**
     * Removes a sequence from the alignment. When the last sequence is removed the alignment length is cleared,
     * so the next sequence added defines a new length.
     *
     * @param sequence the sequence to remove
     * @return {@code true} if the sequence was part of the alignment and has been removed
     */
    public boolean removeAlignedSequence(S sequence) {
        boolean removed = sequences.remove(sequence);
        if (removed && sequences.isEmpty()) {
            // Without this reset an emptied alignment keeps rejecting sequences of any other length.
            length = null;
        }
        return removed;
    }

    // methods for LightweightProfile

    /**
     * Returns one aligned sequence. Uses bioIndex starting at 1 instead of 0.
     *
     * @param listIndex 1-based index of the sequence within the alignment
     * @return sequence at that index
     * @throws IndexOutOfBoundsException if {@code listIndex} is smaller than 1 or larger than {@link #getSize()}
     */
    @Override
    public S getAlignedSequence(int listIndex) {
        return sequences.get(listIndex - 1);
    }

    /**
     * Get the list of sequences.
     *
     * @return unmodifiable view of the aligned sequences, in insertion order
     */
    @Override
    public List<S> getAlignedSequences() {
        return Collections.unmodifiableList(sequences);
    }

    /**
     * Get a list of compounds at a sequence position, i.e. one column of the alignment.
     *
     * @param alignmentIndex position passed on unchanged to each sequence's {@code getCompoundAt}
     * @return unmodifiable list with one compound per sequence, in sequence order
     */
    @Override
    public List<C> getCompoundsAt(int alignmentIndex) {
        List<C> column = new ArrayList<>(sequences.size());
        for (S sequence : sequences) {
            column.add(sequence.getCompoundAt(alignmentIndex));
        }
        return Collections.unmodifiableList(column);
    }

    /**
     * Get the Compounds defined in the first sequence.
     *
     * @return compound set of the first sequence
     * @throws IndexOutOfBoundsException if the alignment holds no sequence
     */
    @Override
    public CompoundSet<C> getCompoundSet() {
        return sequences.get(0).getCompoundSet();
    }

    /**
     * Get the length of the MSA, where it is assumed that all sequences have the same number of positions.
     *
     * @return length of MSA, or 0 if no sequence has been added yet
     */
    @Override
    public int getLength() {
        // length is a boxed Integer that stays null until the first sequence is added;
        // unboxing it directly would throw a NullPointerException on an empty alignment.
        return (length == null) ? 0 : length;
    }

    /**
     * Get the number of sequences in the MSA.
     *
     * @return nr of sequences
     */
    @Override
    public int getSize() {
        return sequences.size();
    }

    /**
     * Get a string representation of the MSA with a fixed width. Sequences are interlaced in blocks of
     * {@code width} columns, with index and conservation lines enabled.
     *
     * @param width number of alignment columns per block
     * @return String
     */
    @Override
    public String toString(int width) {
        return toString(width, null, IOUtils.getIDFormat(sequences), true, true, true, false);
    }

    /**
     * Support for different MSA formats. {@code ALN}, {@code CLUSTALW} and any format not listed explicitly use
     * the CLUSTAL W layout.
     *
     * @param format the output format
     * @return String in one of the supported file formats.
     */
    @Override
    public String toString(StringFormat format) {
        switch (format) {
            case ALN:
            case CLUSTALW:
            default:
                return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(sequences) +
                        " ", true, false, true, false);
            case FASTA:
                return toString(60, null, ">%s%n", false, false, false, false);
            case GCG:
            case MSF:
                return toString(50, IOUtils.getGCGHeader(sequences), IOUtils.getIDFormat(sequences), true, false, false,
                        false);
            case PDBWEB:
                return toString(60, null, "%s", true, false, true, true);
        }
    }

    /**
     * String representation of the MSA: every sequence on a single line, without identifiers.
     *
     * @return String
     */
    @Override
    public String toString() {
        return toString(getLength(), null, null, false, false, false, false);
    }

    // helper methods

    /**
     * Helper method that does all the formatting work.
     *
     * @param width number of alignment columns per output line; values below 1 are treated as 1
     * @param header text placed at the start of the output, or {@code null} for none
     * @param idFormat format string applied to each sequence's accession, or {@code null} to omit identifiers
     * @param interlaced if {@code true}, output is grouped into blocks that each show the same column range of
     *        every sequence; otherwise each sequence is written out completely before the next one
     * @param aligIndices if {@code true}, each interlaced block starts with a line showing its first and last
     *        column index
     * @param aligConservation if {@code true} and the alignment holds exactly two sequences, a conservation line
     *        is written after the first sequence of each interlaced block
     * @param webDisplay if {@code true} and the alignment holds exactly two sequences, the web (PDB style) markup
     *        is used
     * @return String
     */
    private String toString(int width, String header, String idFormat, boolean interlaced, boolean aligIndices,
            boolean aligConservation, boolean webDisplay) {

        // TODO handle circular alignments
        StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header);

        final String newLine = String.format("%n");
        final int msaLength = getLength();
        // Web markup and the conservation line are only produced for pairwise alignments.
        final boolean pairwise = sequences.size() == 2;

        if (webDisplay && pairwise) {
            s.append("<div><pre>");
        }

        width = Math.max(1, width);
        if (interlaced) {
            String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) +
                    "d%n";
            for (int i = 0; i < msaLength; i += width) {
                // start and end are 1-based, inclusive column indices of the current block
                int start = i + 1;
                int end = Math.min(msaLength, i + width);
                if (i > 0) {
                    s.append(newLine);
                }
                if (aligIndices) {
                    if (end < i + width) {
                        // Last block is shorter than width: shrink the index line to fit it.
                        int line = end - start + 1;
                        aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) +
                                "d%n";
                    }
                    if (idFormat != null) {
                        // An empty identifier keeps the index line lined up with the sequence lines.
                        s.append(String.format(idFormat, ""));
                    }
                    s.append(String.format(aligIndFormat, start, end));
                }
                int counter = 0;
                for (S as : sequences) {
                    counter++;
                    if (webDisplay && pairwise) {
                        printSequenceAlignmentWeb(s, counter, idFormat, start, end);
                    } else {
                        if (idFormat != null) {
                            s.append(String.format(idFormat, as.getAccession()));
                        }
                        s.append(as.getSubSequence(start, end).getSequenceAsString());
                        s.append(newLine);
                    }
                    if (aligConservation && pairwise && counter == 1) {
                        // The conservation line goes between the two sequences.
                        printConservation(s, idFormat, start, end, webDisplay);
                    }
                }
            }
        } else {
            for (S as : sequences) {
                if (idFormat != null) {
                    s.append(String.format(idFormat, as.getAccession()));
                }
                for (int i = 0; i < msaLength; i += width) {
                    int start = i + 1;
                    int end = Math.min(msaLength, i + width);
                    s.append(as.getSubSequence(start, end).getSequenceAsString());
                    s.append(newLine);
                }
            }
        }

        if (webDisplay && aligConservation && pairwise) {
            s.append(IOUtils.getPDBLegend());
        }
        return s.toString();
    }

    /**
     * Appends one sequence line of a pairwise alignment block in web (PDB style) markup.
     *
     * @param s builder the line is appended to
     * @param counter 1-based index of the sequence to print
     * @param idFormat format string applied to the accession, or {@code null} to omit the identifier
     * @param start 1-based index of the first alignment column of the block
     * @param end 1-based index of the last alignment column of the block (inclusive)
     */
    private void printSequenceAlignmentWeb(StringBuilder s, int counter, String idFormat, int start, int end) {
        S as = sequences.get(counter - 1);
        S seq1 = sequences.get(0);
        S seq2 = sequences.get(1);

        if (idFormat != null) {
            s.append(String.format(idFormat, as.getAccession()));
        }

        String mySeq = as.getSubSequence(start, end).getSequenceAsString();
        String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
        String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
        CompoundSet<C> cs = getCompoundSet();

        int columns = Math.min(s1.length(), Math.min(s2.length(), mySeq.length()));
        for (int i = 0; i < columns; i++) {
            // i is the 0-based offset inside this block; compounds are addressed by their 1-based
            // alignment position, hence start + i.
            // NOTE(review): assumes one character per compound in getSequenceAsString() - confirm.
            int position = start + i;
            boolean equivalent = cs.compoundsEquivalent(seq1.getCompoundAt(position), seq2.getCompoundAt(position));
            s.append(IOUtils.getPDBCharacter(true, s1.charAt(i), s2.charAt(i), equivalent, mySeq.charAt(i)));
        }

        s.append(String.format("%n"));
    }

    /**
     * Appends the conservation line of a pairwise alignment block, comparing the first two sequences column by
     * column.
     *
     * @param s builder the line is appended to
     * @param idFormat format string used for the line's identifier column, or {@code null} to omit it
     * @param start 1-based index of the first alignment column of the block
     * @param end 1-based index of the last alignment column of the block (inclusive)
     * @param webDisplay passed on to {@code IOUtils.getPDBConservation} to select web markup
     */
    private void printConservation(StringBuilder s, String idFormat, int start, int end, boolean webDisplay) {
        S seq1 = sequences.get(0);
        S seq2 = sequences.get(1);

        if (idFormat != null) {
            AccessionID ac1 = seq1.getAccession();
            String id1 = (ac1 == null) ? "null" : ac1.getID();
            // The identifier column repeats the first accession with its dots replaced by spaces.
            id1 = id1.replace('.', ' ');
            s.append(String.format(idFormat, id1));
        }

        String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
        String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
        CompoundSet<C> cs = getCompoundSet();

        int columns = Math.min(s1.length(), s2.length());
        for (int i = 0; i < columns; i++) {
            // i is the 0-based offset inside this block; compounds are addressed by their 1-based
            // alignment position, hence start + i.
            // NOTE(review): assumes one character per compound in getSequenceAsString() - confirm.
            int position = start + i;
            boolean equivalent = cs.compoundsEquivalent(seq1.getCompoundAt(position), seq2.getCompoundAt(position));
            s.append(IOUtils.getPDBConservation(webDisplay, s1.charAt(i), s2.charAt(i), equivalent));
        }

        s.append(String.format("%n"));
    }

}