/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
 * should be distributed with the code. If you do not have a copy, see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * Created on August 13, 2010 Author: Mark Chapman
 */

package org.biojava.nbio.alignment.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on.
 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i>
 *
 * In general, Stockholm File contains the alignment mark-up lines.<br>
 * <br>
 *
 * <table border="1" align="center">
 * <tr>
 * <td><b>Header Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Reference Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Comment Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Alignment Section</b></td>
 * </tr>
 * </table>
 *
 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br>
 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br>
 *
 * <table border="1">
 * <tr>
 * <th>section field</th>
 * <th>preferred location</th>
 * </tr>
 * <tr>
 * <td>#=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;</td>
 * <td>Above the alignment</td>
 * </tr>
 * <tr>
 * <td>#=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;</td>
 * <td>Below the alignment</td>
 * </tr>
 * <tr>
 * <td>#=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;</td>
 * <td>Above the alignment or just below the corresponding sequence</td>
 * </tr>
 * <tr>
 * <td>#=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;</td>
 * <td>Just below the corresponding sequence</td>
 * </tr>
 * </table>
 *
 * @since 3.0.5
 * @author Amr AL-Hossary
 * @author Marko Vaz
 *
 */
public class StockholmStructure {

    private static final Logger logger = LoggerFactory.getLogger(StockholmStructure.class);

    public static final String PFAM = "PFAM";
    public static final String RFAM = "RFAM";

    private final StockholmFileAnnotation fileAnnotation;
    private final StockholmConsensusAnnotation consAnnotation;
    private final Map<String, StringBuffer> sequences;
    private final Map<String, StockholmSequenceAnnotation> seqsAnnotation;
    private final Map<String, StockholmResidueAnnotation> resAnnotation;

    /**
     * Creates an empty structure: fresh file and consensus annotations and empty sequence / annotation maps.
     */
    public StockholmStructure() {
        fileAnnotation = new StockholmFileAnnotation();
        consAnnotation = new StockholmConsensusAnnotation();
        sequences = new HashMap<String, StringBuffer>();
        seqsAnnotation = new HashMap<String, StockholmSequenceAnnotation>();
        resAnnotation = new HashMap<String, StockholmResidueAnnotation>();
    }

    /**
     * @return the per-file annotation object owned by this structure
     */
    public StockholmFileAnnotation getFileAnnotation() {
        return fileAnnotation;
    }

    /**
     * @return the consensus annotation object owned by this structure
     */
    public StockholmConsensusAnnotation getConsAnnotation() {
        return consAnnotation;
    }

    /**
     * Actually this function should be called appendToSequence
     *
     * @param seqName
     *            name of the sequence to extend
     * @param seqText
     *            text to append
     * @deprecated Use {@link #appendToSequence(String,String)} instead
     */
    @Deprecated
    public void addSequence(String seqName, String seqText) {
        appendToSequence(seqName, seqText);
    }

    /**
     * Appends text to the sequence stored under the given name, creating the entry on first use. No separator is
     * inserted between successive chunks.
     *
     * @param seqName
     *            name of the sequence to extend
     * @param seqText
     *            text to append
     */
    public void appendToSequence(String seqName, String seqText) {
        StringBuffer seq = sequences.get(seqName);
        if (seq != null) {
            // add sequence without space
            seq.append(seqText);
        } else {
            seq = new StringBuffer(seqText);
            sequences.put(seqName, seq);
        }
    }

    /**
     * @return the internal name-to-sequence map itself (not a copy)
     */
    public Map<String, StringBuffer> getSequences() {
        return sequences;
    }

    /**
     * Returns the per-sequence annotation for the given name, creating and registering an empty one if none exists
     * yet.
     */
    private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) {
        StockholmSequenceAnnotation annotation = seqsAnnotation.get(seqName);
        if (annotation == null) {
            annotation = new StockholmSequenceAnnotation();
            seqsAnnotation.put(seqName, annotation);
        }
        return annotation;
    }

    /**
     * Sets the accession number on the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSAccessionNumber(String seqName, String text) {
        getSequenceAnnotation(seqName).setAccessionNumber(text);
    }

    /**
     * Adds text to the description on the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSDescription(String seqName, String text) {
        getSequenceAnnotation(seqName).addToDescription(text);
    }

    /**
     * Adds a database reference to the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSdbReference(String seqName, String text) {
        getSequenceAnnotation(seqName).addDBReference(text);
    }

    /**
     * Sets the organism on the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSOrganismSpecies(String seqName, String text) {
        getSequenceAnnotation(seqName).setOrganism(text);
    }

    /**
     * Sets the organism classification on the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSOrganismClassification(String seqName, String text) {
        getSequenceAnnotation(seqName).setOrganismClassification(text);
    }

    /**
     * Sets the "look" field on the annotation of the named sequence.
     *
     * @param seqName
     *            name of the annotated sequence
     * @param text
     *            annotation text
     */
    public void addGSLook(String seqName, String text) {
        getSequenceAnnotation(seqName).setLook(text);
    }

    /**
     * Returns the per-residue annotation for the given name, creating and registering an empty one if none exists
     * yet.
     */
    private StockholmResidueAnnotation getResidueAnnotation(String seqName) {
        StockholmResidueAnnotation annotation = resAnnotation.get(seqName);
        if (annotation == null) {
            annotation = new StockholmResidueAnnotation();
            resAnnotation.put(seqName, annotation);
        }
        return annotation;
    }

    /** Sets the surface accessibility on the residue annotation of the named sequence. */
    public void addSurfaceAccessibility(String seqName, String text) {
        getResidueAnnotation(seqName).setSurfaceAccessibility(text);
    }

    /** Sets the trans-membrane field on the residue annotation of the named sequence. */
    public void addTransMembrane(String seqName, String text) {
        getResidueAnnotation(seqName).setTransMembrane(text);
    }

    /** Sets the posterior probability on the residue annotation of the named sequence. */
    public void addPosteriorProbability(String seqName, String text) {
        getResidueAnnotation(seqName).setPosteriorProbability(text);
    }

    /** Sets the ligand binding field on the residue annotation of the named sequence. */
    public void addLigandBinding(String seqName, String text) {
        getResidueAnnotation(seqName).setLigandBinding(text);
    }

    /** Sets the active site field on the residue annotation of the named sequence. */
    public void addActiveSite(String seqName, String text) {
        getResidueAnnotation(seqName).setActiveSite(text);
    }

    /** Sets the "AS PFam predicted" field on the residue annotation of the named sequence. */
    public void addASPFamPredicted(String seqName, String text) {
        getResidueAnnotation(seqName).setAsPFamPredicted(text);
    }

    /** Sets the "AS SwissProt" field on the residue annotation of the named sequence. */
    public void addASSwissProt(String seqName, String text) {
        getResidueAnnotation(seqName).setAsSwissProt(text);
    }

    /** Sets the intron field on the residue annotation of the named sequence. */
    public void addIntron(String seqName, String text) {
        getResidueAnnotation(seqName).setIntron(text);
    }

    /** Sets the secondary structure on the residue annotation of the named sequence. */
    public void addSecondaryStructure(String seqName, String text) {
        getResidueAnnotation(seqName).setSecondaryStructure(text);
    }

    /**
     * used to retrieve sequences from the structure
     *
     * @return Biosequences (case sensitive)
     * @see #getBioSequences(boolean)
     * @see #getBioSequences(boolean, String)
     */
    public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() {
        return getBioSequences(false);
    }

    /**
     * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM).
     * <p>
     * A sequence whose text contains unknown compounds is logged and skipped. Start/end coordinates are taken from
     * a name of the form <code>name/start-end</code>; a coordinate that is blank or not an integer is left unset.
     *
     * @param ignoreCase
     *            if <code>true</code>, the function will deal with small letters as if they are capital ones
     * @param forcedSequenceType
     *            either <code>null</code>, {@link #PFAM}, or {@link #RFAM}.
     * @return Biosequences according to the criteria specified
     * @throws IllegalArgumentException
     *             if <code>forcedSequenceType</code> is neither <code>null</code>, {@link #PFAM} nor {@link #RFAM}
     * @see #getBioSequences()
     * @see #getBioSequences(boolean)
     */
    public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase,
            String forcedSequenceType) {
        if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) {
            throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType);
        }
        List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<AbstractSequence<? extends AbstractCompound>>();
        for (Map.Entry<String, StringBuffer> entry : sequences.entrySet()) {
            String sequencename = entry.getKey();
            String sequence = entry.getValue().toString();
            if (ignoreCase) {
                // Locale.ROOT: the default locale may map letters unexpectedly (e.g. Turkish 'i' -> U+0130),
                // which would turn a valid residue into an unknown compound.
                sequence = sequence.toUpperCase(Locale.ROOT);
            }

            AbstractSequence<? extends AbstractCompound> seq;
            try {
                if (forcedSequenceType == null) {
                    seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence);
                } else if (forcedSequenceType.equals(PFAM)) {
                    seq = new ProteinSequence(sequence);
                } else {
                    seq = new RNASequence(sequence);
                }
            } catch (CompoundNotFoundException e) {
                logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename);
                continue;
            }

            String[] seqDetails = splitSeqName(sequencename);
            seq.setDescription(seqDetails[0]);
            seq.setBioBegin(parsePosition(seqDetails[1], sequencename));
            seq.setBioEnd(parsePosition(seqDetails[2], sequencename));

            seqs.add(seq);
        }
        return seqs;
    }

    /**
     * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence
     * TKRA_BACSU/6-322), this function is used to ignore the small letters case.
     *
     * @param ignoreCase
     *            if <code>true</code>, small letters are treated as capital ones
     * @return Biosequences, with the sequence type not forced
     * @see #getBioSequences()
     * @see #getBioSequences(boolean, String)
     */
    public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) {
        return getBioSequences(ignoreCase, null);
    }

    /**
     * Converts one coordinate extracted from a sequence name into an <code>Integer</code>.
     *
     * @param text
     *            raw coordinate text, may be <code>null</code>
     * @param sequenceName
     *            full sequence name, used only for the log message
     * @return the parsed value, or <code>null</code> if the text is <code>null</code>, blank or not an integer
     */
    private static Integer parsePosition(String text, String sequenceName) {
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Integer.valueOf(trimmed);
        } catch (NumberFormatException e) {
            // One oddly named sequence should not make the whole alignment (and toString()) fail.
            logger.warn("Ignoring non-numeric position '{}' in sequence name {}", trimmed, sequenceName);
            return null;
        }
    }

    /**
     * Returns an array with the following sequence related content: name, start, end.
     *
     * @param sequenceName
     *            the sequence from where to extract the content. It is supposed that it follows the following
     *            convention name/start-end (e.g.: COATB_BPIKE/30-81)
     * @return array with the following sequence related content: name, start, end. Start and end are
     *         <code>null</code> when the name does not contain exactly one "/" followed by exactly two "-"
     *         separated parts.
     */
    private static String[] splitSeqName(String sequenceName) {
        String[] result = new String[3];

        String[] barSplit = sequenceName.split("/");
        if (barSplit.length == 2) {
            result[0] = barSplit[0];
            String[] positions = barSplit[1].split("-");
            if (positions.length == 2) {
                result[1] = positions[0];
                result[2] = positions[1];
            }
        } else {
            // No (or more than one) "/": keep the whole name; start and end stay null.
            result[0] = sequenceName;
        }

        return result;
    }

    /**
     * Renders one line per sequence (sequences longer than 50 characters are abbreviated to their first 40
     * characters, "..." and their last 3 characters) followed by the description, then a summary line with the
     * number of rows and the length of the last rendered sequence (0 when there are none).
     */
    @Override
    public String toString() {
        StringBuilder result = new StringBuilder();
        List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false);
        int sequenceLength = 0;
        for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) {
            String sequenceAsString = sequence.getSequenceAsString();
            sequenceLength = sequenceAsString.length();
            if (sequenceLength > 50) {
                result.append(sequenceAsString, 0, 40);
                result.append("...");
                result.append(sequenceAsString, sequenceLength - 3, sequenceLength);
            } else {
                result.append(sequenceAsString);
            }
            result.append(' ').append(sequence.getDescription()).append('\n');
        }
        result.append("Alignment with ").append(bioSeqs.size()).append(" rows and ").append(sequenceLength)
                .append(" columns");

        return result.toString();
    }

    /**
     * A database cross reference made of a database name and a reference string, rendered as
     * <code>database; reference;</code>.
     */
    public static class DatabaseReference {
        public static final String EXPERT = "EXPERT";
        public static final String MIM = "MIM";
        public static final String PFAMB = "PFAMB";
        public static final String PRINTS = "PRINTS";
        public static final String PROSITE = "PROSITE";
        public static final String PROSITE_PROFILE = "PROSITE_PROFILE";
        public static final String SCOP = "SCOP";
        public static final String PDB = "PDB";
        public static final String SMART = "SMART";
        public static final String URL = "URL";
        public static final String LOAD = "LOAD";
        public static final String HOMSTRAD = "HOMSTRAD";
        public static final String INTERPRO = "INTERPRO";

        private final String database;
        /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */
        private final String reference;

        /**
         * @param database
         *            database name, stored as given
         * @param reference
         *            reference text, stored as given
         */
        public DatabaseReference(String database, String reference) {
            this.database = database;
            this.reference = reference;
        }

        /**
         * Parses a string of the form <code>database; reference;</code>. The database is everything before the first
         * ';'. The reference is the trimmed text between the first and the last ';', or, when there is only one
         * ';', the trimmed remainder of the string.
         *
         * @param representativeAnnotationString
         *            the text to parse
         * @throws IllegalArgumentException
         *             if the text contains no ';' at all
         * @throws NullPointerException
         *             if the text is <code>null</code>
         */
        public DatabaseReference(String representativeAnnotationString) {
            int firstSemiColon = representativeAnnotationString.indexOf(';');
            if (firstSemiColon < 0) {
                throw new IllegalArgumentException("Database reference must look like \"database; reference;\" but was: "
                        + representativeAnnotationString);
            }
            int lastSemiColon = representativeAnnotationString.lastIndexOf(';');
            // Tolerate a missing trailing ';' by reading the reference up to the end of the string.
            int referenceEnd = lastSemiColon > firstSemiColon ? lastSemiColon
                    : representativeAnnotationString.length();
            this.database = representativeAnnotationString.substring(0, firstSemiColon);
            this.reference = representativeAnnotationString.substring(firstSemiColon + 1, referenceEnd).trim();
        }

        @Override
        public String toString() {
            return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';')
                    .toString();
        }

        /**
         * @return the database name
         */
        public String getDatabase() {
            return database;
        }

        /**
         * @return the reference text
         */
        public String getReference() {
            return reference;
        }
    }
}