001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This 005 * should be distributed with the code. If you do not have a copy, see: 006 * 007 * http://www.gnu.org/copyleft/lesser.html 008 * 009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments. 010 * 011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page 012 * at: 013 * 014 * http://www.biojava.org/ 015 * 016 * Created on August 13, 2010 Author: Mark Chapman 017 */ 018 019package org.biojava.nbio.alignment.io; 020 021import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 022import org.biojava.nbio.core.sequence.ProteinSequence; 023import org.biojava.nbio.core.sequence.RNASequence; 024import org.biojava.nbio.core.sequence.template.AbstractCompound; 025import org.biojava.nbio.core.sequence.template.AbstractSequence; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import java.util.ArrayList; 030import java.util.HashMap; 031import java.util.List; 032import java.util.Map; 033 034/** 035 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on. 036 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i> 037 * 038 * In general, Stockholm File contains the alignment mark-up lines.<br> 039 * <br> 040 * 041 * <Table border="1" align="center"> 042 * <tr> 043 * <td><b>Header Section</b></td> 044 * </tr> 045 * <tr> 046 * <td><b>Reference Section</b></td> 047 * </tr> 048 * <tr> 049 * <td><b>Comment Section</b></td> 050 * </tr> 051 * <tr> 052 * <td><B>Alignment Section</B></td> 053 * </tr> 054 * </table> 055 * 056 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br> 057 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br> 058 * 059 * <Table border="1"> 060 * <th>section field</th> 061 * <th>preferred location</th> 062 * <tr> 063 * <td>#=GF <feature> <Generic per-File annotation, free text></td> 064 * <td>Above the alignment</td> 065 * <tr> 066 * <td>#=GC <feature> <Generic per-Column annotation, exactly 1 char per column></td> 067 * <td>Below the alignment</td> 068 * <tr> 069 * <td>#=GS <seqname> <feature> <Generic per-Sequence annotation, free text></td> 070 * <td>Above the alignment or just below the corresponding sequence</td> 071 * <tr> 072 * <td>#=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue></td> 073 * <td>Just below the corresponding sequence</td> 074 * </tr> 075 * </table> 076 * 077 * @since 3.0.5 078 * @author Amr ALHOSSARY 079 * @author Marko Vaz 080 * 081 */ 082public class StockholmStructure { 083 084 private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class); 085 086 public static final String PFAM = "PFAM"; 087 public static final String RFAM = "RFAM"; 088 private final StockholmFileAnnotation fileAnnotation; 089 private final StockholmConsensusAnnotation consAnnotation; 090 private final Map<String, StringBuffer> sequences; 091 private final Map<String, StockholmSequenceAnnotation> seqsAnnotation; 092 private final Map<String, StockholmResidueAnnotation> resAnnotation; 093 094 public StockholmStructure() { 095 fileAnnotation = new StockholmFileAnnotation(); 096 consAnnotation = new StockholmConsensusAnnotation(); 097 sequences = new HashMap<String, StringBuffer>(); 098 seqsAnnotation = new HashMap<String, StockholmSequenceAnnotation>(); 099 resAnnotation = new HashMap<String, StockholmResidueAnnotation>(); 100 } 101 102 public StockholmFileAnnotation getFileAnnotation() { 103 return fileAnnotation; 104 } 105 106 public StockholmConsensusAnnotation getConsAnnotation() { 107 return consAnnotation; 108 } 109 110 /** 111 * @param seqName 112 * @param seqText 113 */ 114 public void appendToSequence(String seqName, String seqText) { 115 StringBuffer seq = sequences.get(seqName); 116 if (seq != null) { 117 // add sequence without space 118 seq.append(seqText); 119 } else { 120 seq = new StringBuffer(seqText); 121 sequences.put(seqName, seq); 122 } 123 } 124 125 public Map<String, StringBuffer> getSequences() { 126 return sequences; 127 } 128 129 private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) { 130 if (!seqsAnnotation.containsKey(seqName)) { 131 seqsAnnotation.put(seqName, new StockholmSequenceAnnotation()); 132 } 133 return seqsAnnotation.get(seqName); 134 } 135 136 /** 137 * @param seqName 138 * @param text 139 */ 140 public void addGSAccessionNumber(String seqName, String text) { 141 getSequenceAnnotation(seqName).setAccessionNumber(text); 142 } 143 144 public void addGSDescription(String seqName, String text) { 145 getSequenceAnnotation(seqName).addToDescription(text); 146 } 147 148 /** 149 * @param seqName 150 * @param text 151 */ 152 public void addGSdbReference(String seqName, String text) { 153 getSequenceAnnotation(seqName).addDBReference(text); 154 } 155 156 public void addGSOrganismSpecies(String seqName, String text) { 157 getSequenceAnnotation(seqName).setOrganism(text); 158 } 159 160 public void addGSOrganismClassification(String seqName, String text) { 161 getSequenceAnnotation(seqName).setOrganismClassification(text); 162 } 163 164 public void addGSLook(String seqName, String text) { 165 getSequenceAnnotation(seqName).setLook(text); 166 } 167 168 private StockholmResidueAnnotation getResidueAnnotation(String seqName) { 169 if (!resAnnotation.containsKey(seqName)) { 170 resAnnotation.put(seqName, new StockholmResidueAnnotation()); 171 } 172 return resAnnotation.get(seqName); 173 } 174 175 public void addSurfaceAccessibility(String seqName, String text) { 176 getResidueAnnotation(seqName).setSurfaceAccessibility(text); 177 } 178 179 public void addTransMembrane(String seqName, String text) { 180 getResidueAnnotation(seqName).setTransMembrane(text); 181 } 182 183 public void addPosteriorProbability(String seqName, String text) { 184 getResidueAnnotation(seqName).setPosteriorProbability(text); 185 } 186 187 public void addLigandBinding(String seqName, String text) { 188 getResidueAnnotation(seqName).setLigandBinding(text); 189 } 190 191 public void addActiveSite(String seqName, String text) { 192 getResidueAnnotation(seqName).setActiveSite(text); 193 } 194 195 public void addASPFamPredicted(String seqName, String text) { 196 getResidueAnnotation(seqName).setAsPFamPredicted(text); 197 } 198 199 public void addASSwissProt(String seqName, String text) { 200 getResidueAnnotation(seqName).setAsSwissProt(text); 201 } 202 203 public void addIntron(String seqName, String text) { 204 getResidueAnnotation(seqName).setIntron(text); 205 } 206 207 public void addSecondaryStructure(String seqName, String text) { 208 getResidueAnnotation(seqName).setSecondaryStructure(text); 209 } 210 211 /** 212 * used to retrieve sequences from the structure 213 * 214 * @return Biosequences (case sensitive) 215 * @see #getBioSequences(boolean) 216 * @see #getBioSequences(boolean, String) 217 */ 218 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() { 219 return getBioSequences(false); 220 } 221 222 /** 223 * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM). 224 * 225 * @param ignoreCase 226 * if <code>true</code>, the function will deal with small letters as if they are capital ones 227 * @param forcedSequenceType 228 * either <code>null</code>, {@link #PFAM}, or {@link #RFAM}. 229 * @return Biosequences according to the criteria specified 230 * @see #getBioSequences() 231 * @see #getBioSequences(boolean) 232 */ 233 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase, 234 String forcedSequenceType) { 235 if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) { 236 throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType); 237 } 238 List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<AbstractSequence<? extends AbstractCompound>>(); 239 for (String sequencename : sequences.keySet()) { 240 AbstractSequence<? extends AbstractCompound> seq = null; 241 String sequence = sequences.get(sequencename).toString(); 242 if (ignoreCase) { 243 sequence = sequence.toUpperCase(); 244 } 245 246 try { 247 if (forcedSequenceType == null) 248 seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence); 249 else if (forcedSequenceType.equals(PFAM)) 250 seq = new ProteinSequence(sequence); 251 else 252 seq = new RNASequence(sequence); 253 } catch (CompoundNotFoundException e) { 254 logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename); 255 continue; 256 } 257 String[] seqDetails = splitSeqName(sequencename); 258 seq.setDescription(seqDetails[0]); 259 seq.setBioBegin((seqDetails[1] == null || seqDetails[1].trim().equals("") ? null : new Integer( 260 seqDetails[1]))); 261 seq.setBioEnd((seqDetails[2] == null || seqDetails[2].trim().equals("") ? null : new Integer(seqDetails[2]))); 262 263 seqs.add(seq); 264 } 265 return seqs; 266 } 267 268 /** 269 * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence 270 * TKRA_BACSU/6-322), this function is used to ignore the small letters case. 271 * 272 * @param ignoreCase 273 * @return 274 * @see #getBioSequences() 275 * @see #getBioSequences(boolean, String) 276 */ 277 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) { 278 return getBioSequences(ignoreCase, null); 279 } 280 281 /** 282 * Returns an array with the following sequence related content: name, start, end. 283 * 284 * @param sequenceName 285 * the sequence from where to extract the content. It is supposed that it follows the following 286 * convention name/start-end (e.g.: COATB_BPIKE/30-81) 287 * @return array with the following sequence related content: name, start, end. 288 */ 289 private String[] splitSeqName(String sequenceName) { 290 String[] result = new String[3]; 291 292 String[] barSplit = sequenceName.split("/"); 293 if (barSplit.length == 2) { 294 result[0] = barSplit[0]; 295 String[] positions = barSplit[1].split("-"); 296 if (positions.length == 2) { 297 result[1] = positions[0]; 298 result[2] = positions[1]; 299 } 300 } else { 301 result[0] = sequenceName; 302 result[1] = null; 303 result[2] = null; 304 } 305 306 return result; 307 } 308 309 @Override 310 public String toString() { 311 StringBuffer result = new StringBuffer(); 312 List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false); 313 int sequenceLength = -1; 314 for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) { 315 String sequenceAsString = sequence.getSequenceAsString(); 316 sequenceLength = sequenceAsString.length(); 317 if (sequenceLength > 50) { 318 result.append(sequenceAsString.substring(0, 40)); 319 result.append("..."); 320 result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength)); 321 } else { 322 result.append(sequenceAsString); 323 } 324 result.append(" " + sequence.getDescription() + "\n"); 325 } 326 result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns"); 327 328 return result.toString(); 329 } 330 331 public static class DatabaseReference { 332 public static final String EXPERT = "EXPERT"; 333 public static final String MIM = "MIM"; 334 public static final String PFAMB = "PFAMB"; 335 public static final String PRINTS = "PRINTS"; 336 public static final String PROSITE = "PROSITE"; 337 public static final String PROSITE_PROFILE = "PROSITE_PROFILE"; 338 public static final String SCOP = "SCOP"; 339 public static final String PDB = "PDB"; 340 public static final String SMART = "SMART"; 341 public static final String URL = "URL"; 342 public static final String LOAD = "LOAD"; 343 public static final String HOMSTRAD = "HOMSTRAD"; 344 public static final String INTERPRO = "INTERPRO"; 345 346 private final String database; 347 /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */ 348 private final String reference; 349 350 public DatabaseReference(String database, String reference) { 351 this.database = database; 352 this.reference = reference; 353 } 354 355 public DatabaseReference(String representativeAnnotationString) { 356 int semiColonIndex = representativeAnnotationString.indexOf(';'); 357 this.database = representativeAnnotationString.substring(0, semiColonIndex); 358 this.reference = representativeAnnotationString.substring(semiColonIndex + 1, 359 representativeAnnotationString.lastIndexOf(';')).trim(); 360 } 361 362 @Override 363 public String toString() { 364 return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';') 365 .toString(); 366 } 367 368 public String getDatabase() { 369 return database; 370 } 371 372 public String getReference() { 373 return reference; 374 } 375 } 376}